Compare commits

...

22 Commits

Author SHA1 Message Date
renovate[bot]
4d6cc5f29b Update dependency io.ktor:ktor-client-cio to v3.4.1 2026-03-03 13:41:09 +00:00
Julien Lengrand-Lambert
89dbda55b8 Moves up to next version, and fixes demo-remote 2025-10-26 00:00:32 +01:00
julien Lengrand-Lambert
d982bc94cb Adding retro Java 17 compatibility
* Moving back to Java 17 compatibility. I'll change demo-remote a little later, because otherwise I have to break the project for some time.

* Prepares version 0.1.3
2025-10-26 00:43:04 +02:00
Julien Lengrand-Lambert
f829d56a43 Update publishing plugin to 0.34 2025-10-19 03:21:52 +02:00
renovate[bot]
0364ab5d0c Update plugin org.jetbrains.kotlinx.kover to v0.9.3 (#44)
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
2025-10-19 03:18:31 +02:00
Julien Lengrand-Lambert
389f16fe1c Update contributing file 2025-10-19 03:17:21 +02:00
Julien Lengrand-Lambert
f2cbbe048d First version of contributing.md 2025-10-19 03:13:08 +02:00
Julien Lengrand-Lambert
4333d077c3 Updates remote demo 2025-10-19 03:09:01 +02:00
Julien Lengrand-Lambert
20dfc326c7 Prepares 0.1.3 2025-10-19 02:59:19 +02:00
Julien Lengrand-Lambert
3ec5410f94 Creates 0.1.2 2025-10-19 02:51:32 +02:00
julien Lengrand-Lambert
12de34aa60 Feat/updates 10 25 (#42)
* Update dependency com.fleeksoft.ksoup:ksoup-network to v0.2.5 (#40)

Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>

* Update whole ksoup to 0.2.5

* Update dependency gradle to v8.14.3 (#37)

Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>

* Update dependency org.junit:junit-bom to v5.14.0 (#36)

Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>

* Update dependency gradle to v8.14.3 (#43)

Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>

* Update plugin org.jetbrains.kotlin.jvm to v2.2.20 (#35)

Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>

* Update ktor from scraper

* Update settings

* Update settings

* Update gradle/actions action to v4.4.4 (#31)

Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>

* Adds claude init

* Upgrades to Java 24

---------

Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
2025-10-19 02:48:47 +02:00
Julien Lengrand-Lambert
e38c968151 Next version tagging 2025-06-09 09:13:53 +02:00
Julien Lengrand-Lambert
e2817cb15c Prepares release 2025-06-09 09:03:29 +02:00
julien Lengrand-Lambert
5f390de944 Generates HTML from Object (#30)
* Adds Generator
* Corrects README
2025-06-09 09:02:27 +02:00
Julien Lengrand-Lambert
58abce8cb1 Updates version 2025-06-08 19:49:30 +02:00
Julien Lengrand-Lambert
91da68172f Lining up the version 2025-06-08 14:23:58 +02:00
renovate[bot]
d7cef1714e Update dependency gradle to v8.14.2 (#28)
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
2025-06-08 14:17:12 +02:00
renovate[bot]
9d94d22a5e Update dependency org.junit:junit-bom to v5.13.1 (#25)
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
2025-06-08 14:17:02 +02:00
julien Lengrand-Lambert
3c0eed60a7 Makes library multiplatform (#27)
* Replaces JSoup by Ksoup
2025-06-08 14:16:48 +02:00
julien Lengrand-Lambert
5372fab21c Fix types (#22)
* Improves types
* Adds missing properties to music album
* Changes gender from String to Enum
* Changes URL to an actual URL
* Fix typo
* Adds scalable live testing on real data
* Uses OffsetDateTime for articles, videos and books
2025-06-03 00:36:03 +02:00
jetbrains-junie[bot]
79b169fa81 Initialize JetBrains Junie 🚀 (#24)
* feat(junie): added .junie workflow

* feat(junie): added .devcontainer.json

---------

Co-authored-by: jetbrains-junie[bot] <201638009+jetbrains-junie[bot]@users.noreply.github.com>
2025-06-03 00:12:24 +02:00
renovate[bot]
11a22550b7 Update dependency gradle to v8.14.1 (#23)
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
2025-06-02 23:35:45 +02:00
29 changed files with 4632 additions and 98 deletions

View File

@@ -0,0 +1,12 @@
{
"name": "Java",
"image": "mcr.microsoft.com/devcontainers/java:1-21",
"features": {
"ghcr.io/devcontainers/features/java:1": {
"version": "none",
"installMaven": "true",
"mavenVersion": "3.8.6",
"installGradle": "true"
}
}
}

View File

@@ -34,7 +34,7 @@ jobs:
# Configure Gradle for optimal use in GitHub Actions, including caching of downloaded dependencies.
# See: https://github.com/gradle/actions/blob/main/setup-gradle/README.md
- name: Setup Gradle
uses: gradle/actions/setup-gradle@8379f6a1328ee0e06e2bb424dadb7b159856a326 # v4.4.0
uses: gradle/actions/setup-gradle@748248ddd2a24f49513d8f472f81c3a07d4d50e1 # v4.4.4
- name: Build with Gradle Wrapper
run: ./gradlew build
@@ -57,7 +57,7 @@ jobs:
distribution: 'temurin'
- name: Setup Gradle
uses: gradle/actions/setup-gradle@8379f6a1328ee0e06e2bb424dadb7b159856a326 # v4.4.0
uses: gradle/actions/setup-gradle@748248ddd2a24f49513d8f472f81c3a07d4d50e1 # v4.4.4
- name: Build with Gradle Wrapper
run: ./gradlew koverXmlReport
@@ -88,5 +88,5 @@ jobs:
# Generates and submits a dependency graph, enabling Dependabot Alerts for all project dependencies.
# See: https://github.com/gradle/actions/blob/main/dependency-submission/README.md
- name: Generate and submit dependency graph
uses: gradle/actions/dependency-submission@8379f6a1328ee0e06e2bb424dadb7b159856a326 # v4.4.0
uses: gradle/actions/dependency-submission@748248ddd2a24f49513d8f472f81c3a07d4d50e1 # v4.4.4

22
.github/workflows/junie.yml vendored Normal file
View File

@@ -0,0 +1,22 @@
name: Junie
run-name: Junie run ${{ inputs.run_id }}
permissions:
contents: write
pull-requests: write
on:
workflow_dispatch:
inputs:
run_id:
description: "id of workflow process"
required: true
workflow_params:
description: "stringified params"
required: true
jobs:
call-workflow-passing-data:
uses: jetbrains-junie/junie-workflows/.github/workflows/ej-issue.yml@main
with:
workflow_params: ${{ inputs.workflow_params }}

6
.idea/AndroidProjectSystem.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AndroidProjectSystem">
<option name="providerId" value="com.android.tools.idea.GradleProjectSystem" />
</component>
</project>

1
.idea/gradle.xml generated
View File

@@ -12,6 +12,7 @@
<option value="$PROJECT_DIR$/demo" />
<option value="$PROJECT_DIR$/demo-remote" />
<option value="$PROJECT_DIR$/opengraphkt" />
<option value="$PROJECT_DIR$/scrape-test" />
</set>
</option>
</GradleProjectSettings>

2
.idea/kotlinc.xml generated
View File

@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="KotlinJpsPluginSettings">
<option name="version" value="2.1.21" />
<option name="version" value="2.2.20" />
</component>
</project>

2
.idea/misc.xml generated
View File

@@ -4,7 +4,7 @@
<component name="FrameworkDetectionExcludesConfiguration">
<file type="web" url="file://$PROJECT_DIR$" />
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_21" default="true" project-jdk-name="temurin-21" project-jdk-type="JavaSDK">
<component name="ProjectRootManager" version="2" languageLevel="JDK_17" project-jdk-name="temurin-21" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

74
CLAUDE.md Normal file
View File

@@ -0,0 +1,74 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
OpenGraphKt is a minimalist Kotlin multiplatform library for parsing Open Graph protocol tags from HTML. It wraps Ksoup (a Kotlin port of JSoup) to extract and structure Open Graph metadata.
**Current Status**: Pre-alpha - Protocol implementation is complete for `og:` tags, but type system needs refinement.
## Project Structure
This is a multi-module Gradle project:
- `opengraphkt/` - Core library module (published to Maven Central as `fr.lengrand:opengraphkt`)
- `demo/` - Local file parsing examples
- `demo-remote/` - Remote URL parsing examples (see Main.kt for usage)
- `scrape-test/` - Testing/scraping utilities
## Common Commands
### Build and Test
```bash
./gradlew build # Build all modules
./gradlew test # Run all tests
./gradlew :opengraphkt:test # Run tests for core library only
```
### Code Coverage
```bash
./gradlew koverXmlReport # Generate XML coverage report
./gradlew koverVerify # Verify coverage meets 70% minimum threshold
```
### Publishing
```bash
./gradlew publishToMavenLocal # Publish to local Maven repo for testing
```
## Architecture
### Core Components
**Parser (`Parser.kt`)**: Main entry point that accepts multiple input types:
- `parse(url: URL)` - Fetches and parses remote HTML
- `parse(html: String)` - Parses raw HTML string
- `parse(file: File)` - Parses local HTML file
- `parse(document: Document)` - Parses Ksoup Document
The parser extracts `meta[property^=og:]` tags and builds structured data models.
**Data Models (`Models.kt`)**: Type-safe representations of Open Graph data:
- `Data` - Main container with `isValid()` method checking required fields (title, type, image, url)
- Base types: `Image`, `Video`, `Audio`
- Content-specific types: `Article`, `Book`, `Profile`
- Music types: `MusicSong`, `MusicAlbum`, `MusicPlaylist`, `MusicRadioStation`
- Video types: `VideoMovie`, `VideoEpisode`
### Key Implementation Details
**Tag Grouping**: Tags are grouped by namespace (prefix before first colon) to handle structured properties like `og:image:width`, `og:image:height` that belong to the preceding `og:image` tag.
**Date Handling**: ISO 8601 datetime parsing with fallback for date-only formats (appends `T00:00:00Z`).
**Structured Property Association**: Images/Videos/Audio with their metadata (width, height, type, etc.) are associated by parsing sequential tags - each base tag (`og:image`) is paired with following attribute tags (`og:image:width`) until the next base tag.
## Development Notes
- **JVM Toolchain**: Java 17 (see `jvmToolchain(17)` in build files)
- **Minimum Java Version**: Java 17 (required by Ksoup dependency)
- **Testing**: CI matrix tests on Java 17 and 23 via GitHub Actions
- **Dependencies**: Core library uses Ksoup (v0.2.5) for HTML parsing and network requests
- **Maven Coordinates**: Group `fr.lengrand`, artifact `opengraphkt`, currently at `0.1.4-SNAPSHOT`
- **Code Coverage**: Kover plugin enforces 70% minimum coverage threshold

158
CONTRIBUTING.md Normal file
View File

@@ -0,0 +1,158 @@
# Contributing to OpenGraphKt
Thank you for your interest in contributing to OpenGraphKt! This document provides guidelines and instructions for contributing to the project.
## Development Setup
### Prerequisites
- Java 17 or later (JVM toolchain configured for Java 17)
- Gradle 8.14.3+ (wrapper included)
- Git
### Getting Started
1. Clone the repository:
```bash
git clone https://github.com/jlengrand/OpenGraphKt.git
cd OpenGraphKt
```
2. Build the project:
```bash
./gradlew build
```
3. Run tests:
```bash
./gradlew test
```
4. Check code coverage:
```bash
./gradlew koverXmlReport
./gradlew koverVerify # Enforces 70% minimum coverage
```
## Project Structure
- `opengraphkt/` - Core library module (published to Maven Central)
- `demo/` - Local file parsing examples
- `demo-remote/` - Remote URL parsing examples using published artifact
- `scrape-test/` - Testing/scraping utilities
## Making Changes
### Code Style
- Follow Kotlin coding conventions
- Use meaningful variable and function names
- Add KDoc comments for public APIs
### Testing
- Write tests for all new functionality
- Maintain minimum 70% code coverage (enforced by Kover)
- Run tests locally before submitting PR: `./gradlew test`
### Commits
- Write clear, concise commit messages
- Reference issue numbers when applicable
- Keep commits focused and atomic
## Submitting Changes
1. Fork the repository
2. Create a feature branch: `git checkout -b feature/your-feature-name`
3. Make your changes
4. Run tests and ensure coverage: `./gradlew test koverVerify`
5. Commit your changes
6. Push to your fork
7. Submit a Pull Request
## Publishing New Versions
This section is for maintainers only.
### Version Numbering
OpenGraphKt follows [Semantic Versioning](https://semver.org/):
- **MAJOR** version for incompatible API changes
- **MINOR** version for backwards-compatible functionality additions
- **PATCH** version for backwards-compatible bug fixes
### Pre-Release Checklist
1. **Update version number** in `opengraphkt/build.gradle.kts`:
```kotlin
version = "0.1.3-SNAPSHOT" // Change to "0.1.3" for release
```
2. **Update demo-remote dependency** in `demo-remote/build.gradle.kts`:
```kotlin
dependencies {
implementation("fr.lengrand:opengraphkt:0.1.3") // Match release version
}
```
3. **Run full test suite**:
```bash
./gradlew clean build test koverVerify
```
4. **Test demo applications**:
```bash
./gradlew :demo:run
./gradlew :demo-remote:run
```
### Publishing to Maven Central
The project uses the `com.vanniktech.maven.publish` plugin for publishing. Publishing is automated through GitHub Actions.
1. **Bump version** in `opengraphkt/build.gradle.kts`:
```kotlin
version = "0.1.4"
```
2. **Publish to local Maven for testing** (optional):
```bash
./gradlew publishToMavenLocal
```
3. **Create GitHub release** (this triggers the publishing workflow):
- Go to GitHub repository → Releases → "Draft a new release"
- Click "Choose a tag" and create a new tag (e.g., `v0.1.3`)
- Set the release title (e.g., `v0.1.3`)
- Add release notes describing changes
- Click "Publish release"
4. **GitHub Actions will automatically**:
- Build the project
- Run tests
- Publish to Maven Central
- The workflow is triggered automatically when you create a new release
5. After a few minutes, you will see the new version in [Maven Central](https://mvnrepository.com/artifact/fr.lengrand/opengraphkt). You can also check the real-time status directly on [Sonatype Central](https://central.sonatype.com/publishing/deployments).
### Post-Release Steps
1. **Bump version to next SNAPSHOT** in `opengraphkt/build.gradle.kts`:
```kotlin
version = "0.1.5-SNAPSHOT"
```
2. **Commit version bump**:
```bash
git commit -am "Bump version to 0.1.5-SNAPSHOT"
git push
```
## Questions?
If you have questions or need help, please:
- Open an issue on GitHub
- Check existing issues and discussions
Thank you for contributing to OpenGraphKt!

View File

@@ -9,13 +9,12 @@
![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/jlengrand/OpenGraphKt)
[OpenGraphKt](https://github.com/jlengrand/OpenGraphKt) is a minimalist Kotlin library to work with the [Open Graph tags](https://ogp.me/) protocol.
[OpenGraphKt](https://github.com/jlengrand/OpenGraphKt) is a minimalist Kotlin multiplatform library to work with the [Open Graph tags](https://ogp.me/) protocol.
OpenGraphKt is a tiny wrapper on top of Ksoup, a Kotlin multiplatform port of JSoup.
## Current status
* Library can extract OpenGraph tags from HTML via a `URL`, `String` or `File` input.
* Current implementation is JVM only, due to the `JSoup` dependency.
* Protocol implementation is complete for `og:` tags, but types aren't fully correct (most types currently are `String`).
* Library should be considered pre-alpha; use it in production at your own risk :).
@@ -28,7 +27,7 @@ In short :
* Add dependency to your Maven / Gradle file. For example :
```bash
implementation("fr.lengrand:opengraphkt:0.0.2")
implementation("fr.lengrand:opengraphkt:0.1.0")
```
* Enjoy:
@@ -44,11 +43,6 @@ println("Is valid: ${openGraphDataDoc.isValid()}")
// Is valid: true
```
## Dependencies
- [JSoup](https://jsoup.org/)
## Author
* [Julien Lengrand-Lambert](https://github.com/jlengrand)

View File

@@ -1,3 +1,3 @@
plugins {
kotlin("jvm") version "2.1.21" apply false
kotlin("jvm") version "2.2.20" apply false
}

View File

@@ -10,7 +10,7 @@ repositories {
}
dependencies {
implementation("fr.lengrand:opengraphkt:0.0.2")
implementation("fr.lengrand:opengraphkt:0.1.3")
testImplementation(kotlin("test"))
}
@@ -19,9 +19,9 @@ tasks.test {
}
kotlin {
jvmToolchain(23)
jvmToolchain(17)
}
application {
mainClass = "fr.lengrand.opengraphkt.MainKt"
mainClass = "fr.lengrand.opengraphktremote.MainKt"
}

View File

@@ -12,7 +12,9 @@ repositories {
}
dependencies {
implementation("org.jsoup:jsoup:1.20.1")
implementation("com.fleeksoft.ksoup:ksoup:0.2.5")
implementation("com.fleeksoft.ksoup:ksoup-kotlinx:0.2.5")
implementation("com.fleeksoft.ksoup:ksoup-network:0.2.5")
implementation(project(":opengraphkt"))
testImplementation(kotlin("test"))
}
@@ -22,7 +24,7 @@ tasks.test {
}
kotlin {
jvmToolchain(23)
jvmToolchain(17)
}
application {

View File

@@ -1,6 +1,6 @@
package fr.lengrand.opengraphkt
import org.jsoup.Jsoup
import com.fleeksoft.ksoup.Ksoup
import java.io.File
import java.net.URI
@@ -25,7 +25,7 @@ fun main() {
println("\nExample 2: Parsing from File")
try {
val resourceUrl = object {}.javaClass.getResource("/example.html")
val resourceFile = File(resourceUrl.toURI())
val resourceFile = File(resourceUrl!!.toURI())
// Parse the file
val openGraphData = parser.parse(resourceFile)
@@ -66,7 +66,7 @@ fun main() {
// Example 4: Parse Open Graph data from a Jsoup Document
println("\nExample 4: Parsing from JSoup Document")
val doc = Jsoup.parse(html)
val doc = Ksoup.parse(html)
val openGraphDataDoc = parser.parse(doc)
println("Title: ${openGraphDataDoc.title}")

View File

@@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.14-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.14.3-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME

View File

@@ -1,20 +1,20 @@
import com.vanniktech.maven.publish.SonatypeHost
plugins {
kotlin("jvm")
id("com.vanniktech.maven.publish") version "0.32.0"
id("org.jetbrains.kotlinx.kover") version "0.9.1"
id("com.vanniktech.maven.publish") version "0.34.0"
id("org.jetbrains.kotlinx.kover") version "0.9.3"
}
group = "fr.lengrand"
version = "0.0.3-SNAPSHOT"
version = "0.1.4-SNAPSHOT"
repositories {
mavenCentral()
}
dependencies {
implementation("org.jsoup:jsoup:1.20.1")
implementation("com.fleeksoft.ksoup:ksoup:0.2.5")
implementation("com.fleeksoft.ksoup:ksoup-kotlinx:0.2.5")
implementation("com.fleeksoft.ksoup:ksoup-network:0.2.5")
testImplementation(kotlin("test"))
}
@@ -33,12 +33,20 @@ tasks.jar {
}
}
java {
sourceCompatibility = JavaVersion.VERSION_17
targetCompatibility = JavaVersion.VERSION_17
}
kotlin {
jvmToolchain(23)
jvmToolchain(17)
compilerOptions {
jvmTarget.set(org.jetbrains.kotlin.gradle.dsl.JvmTarget.JVM_17)
}
}
mavenPublishing {
publishToMavenCentral(SonatypeHost.CENTRAL_PORTAL)
publishToMavenCentral()
signAllPublications()

View File

@@ -0,0 +1,337 @@
package fr.lengrand.opengraphkt
import java.time.OffsetDateTime
/**
 * A generator for Open Graph protocol HTML meta tags.
 *
 * Converts an OpenGraph [Data] object into `<meta property="..." content="..." />` tags,
 * one per line, suitable for embedding in the `<head>` of an HTML document.
 *
 * NOTE(review): type-specific properties are emitted with an `og:` prefix
 * (e.g. `og:article:author`), whereas the Open Graph specification names them
 * without it (`article:author`). Presumably this mirrors what this library's
 * parser reads back — confirm before changing.
 *
 * @see <a href="https://ogp.me/">Open Graph Protocol</a>
 */
class Generator {

    /**
     * Generates HTML meta tags from an OpenGraph Data object.
     *
     * Tags are emitted in a fixed order: basic metadata, images, videos, audios,
     * then the tags specific to the object's type (if any).
     *
     * @param data The OpenGraph Data object to convert to HTML meta tags
     * @return A string containing the HTML meta tags, separated by newlines
     *         (empty when no property is set)
     */
    fun generate(data: Data): String {
        val tags = mutableListOf<String>()

        // Add basic metadata tags
        addBasicMetaTags(data, tags)

        // Add image tags
        addImageTags(data.images, tags)

        // Add video tags
        addVideoTags(data.videos, tags)

        // Add audio tags
        addAudioTags(data.audios, tags)

        // Add type-specific tags
        when (data.getType()) {
            Type.ARTICLE -> addArticleTags(data.article, tags)
            Type.PROFILE -> addProfileTags(data.profile, tags)
            Type.BOOK -> addBookTags(data.book, tags)
            Type.MUSIC_SONG -> addMusicSongTags(data.musicSong, tags)
            Type.MUSIC_ALBUM -> addMusicAlbumTags(data.musicAlbum, tags)
            Type.MUSIC_PLAYLIST -> addMusicPlaylistTags(data.musicPlaylist, tags)
            Type.MUSIC_RADIO_STATION -> addMusicRadioStationTags(data.musicRadioStation, tags)
            // video.tv_show and video.other share the video.movie property set
            Type.VIDEO_MOVIE, Type.VIDEO_TV_SHOW, Type.VIDEO_OTHER -> addVideoMovieTags(data.videoMovie, tags)
            Type.VIDEO_EPISODE -> addVideoEpisodeTags(data.videoEpisode, tags)
            else -> { /* No additional tags for other types */ }
        }

        return tags.joinToString("\n")
    }

    /**
     * Adds basic Open Graph meta tags to the list.
     *
     * Properties that are `null` are skipped.
     *
     * @param data The OpenGraph Data object
     * @param tags The list to add the tags to
     */
    private fun addBasicMetaTags(data: Data, tags: MutableList<String>) {
        // Required properties
        data.title?.let { tags.add(createMetaTag("og:title", it)) }
        data.type?.let { tags.add(createMetaTag("og:type", it)) }
        data.url?.let { tags.add(createMetaTag("og:url", it.toString())) }

        // Optional properties
        data.description?.let { tags.add(createMetaTag("og:description", it)) }
        data.siteName?.let { tags.add(createMetaTag("og:site_name", it)) }
        data.determiner?.let { tags.add(createMetaTag("og:determiner", it)) }
        data.locale?.let { tags.add(createMetaTag("og:locale", it)) }

        // Locale alternates
        data.localeAlternate.forEach { locale ->
            tags.add(createMetaTag("og:locale:alternate", locale))
        }
    }

    /**
     * Adds image meta tags to the list.
     *
     * For each image the base `og:image` tag is written first, followed by its
     * structured properties, so attributes stay adjacent to the image they describe.
     *
     * @param images The list of Image objects
     * @param tags The list to add the tags to
     */
    private fun addImageTags(images: List<Image>, tags: MutableList<String>) {
        images.forEach { image ->
            image.url?.let { tags.add(createMetaTag("og:image", it)) }
            image.secureUrl?.let { tags.add(createMetaTag("og:image:secure_url", it)) }
            image.type?.let { tags.add(createMetaTag("og:image:type", it)) }
            image.width?.let { tags.add(createMetaTag("og:image:width", it.toString())) }
            image.height?.let { tags.add(createMetaTag("og:image:height", it.toString())) }
            image.alt?.let { tags.add(createMetaTag("og:image:alt", it)) }
        }
    }

    /**
     * Adds video meta tags to the list.
     *
     * For each video the base `og:video` tag is written first, followed by its
     * structured properties.
     *
     * @param videos The list of Video objects
     * @param tags The list to add the tags to
     */
    private fun addVideoTags(videos: List<Video>, tags: MutableList<String>) {
        videos.forEach { video ->
            video.url?.let { tags.add(createMetaTag("og:video", it)) }
            video.secureUrl?.let { tags.add(createMetaTag("og:video:secure_url", it)) }
            video.type?.let { tags.add(createMetaTag("og:video:type", it)) }
            video.width?.let { tags.add(createMetaTag("og:video:width", it.toString())) }
            video.height?.let { tags.add(createMetaTag("og:video:height", it.toString())) }
            video.duration?.let { tags.add(createMetaTag("og:video:duration", it.toString())) }
        }
    }

    /**
     * Adds audio meta tags to the list.
     *
     * @param audios The list of Audio objects
     * @param tags The list to add the tags to
     */
    private fun addAudioTags(audios: List<Audio>, tags: MutableList<String>) {
        audios.forEach { audio ->
            audio.url?.let { tags.add(createMetaTag("og:audio", it)) }
            audio.secureUrl?.let { tags.add(createMetaTag("og:audio:secure_url", it)) }
            audio.type?.let { tags.add(createMetaTag("og:audio:type", it)) }
        }
    }

    /**
     * Adds article-specific meta tags to the list.
     *
     * @param article The Article object; nothing is added when it is `null`
     * @param tags The list to add the tags to
     */
    private fun addArticleTags(article: Article?, tags: MutableList<String>) {
        if (article == null) return

        article.publishedTime?.let { tags.add(createMetaTag("og:article:published_time", formatDateTime(it))) }
        article.modifiedTime?.let { tags.add(createMetaTag("og:article:modified_time", formatDateTime(it))) }
        article.expirationTime?.let { tags.add(createMetaTag("og:article:expiration_time", formatDateTime(it))) }
        article.section?.let { tags.add(createMetaTag("og:article:section", it)) }

        article.authors.forEach { author ->
            tags.add(createMetaTag("og:article:author", author))
        }

        article.tags.forEach { tag ->
            tags.add(createMetaTag("og:article:tag", tag))
        }
    }

    /**
     * Adds profile-specific meta tags to the list.
     *
     * @param profile The Profile object; nothing is added when it is `null`
     * @param tags The list to add the tags to
     */
    private fun addProfileTags(profile: Profile?, tags: MutableList<String>) {
        if (profile == null) return

        profile.firstName?.let { tags.add(createMetaTag("og:profile:first_name", it)) }
        profile.lastName?.let { tags.add(createMetaTag("og:profile:last_name", it)) }
        profile.username?.let { tags.add(createMetaTag("og:profile:username", it)) }
        profile.gender?.let { tags.add(createMetaTag("og:profile:gender", it.toString())) }
    }

    /**
     * Adds book-specific meta tags to the list.
     *
     * @param book The Book object; nothing is added when it is `null`
     * @param tags The list to add the tags to
     */
    private fun addBookTags(book: Book?, tags: MutableList<String>) {
        if (book == null) return

        book.authors.forEach { author ->
            tags.add(createMetaTag("og:book:author", author))
        }

        book.isbn?.let { tags.add(createMetaTag("og:book:isbn", it)) }
        book.releaseDate?.let { tags.add(createMetaTag("og:book:release_date", formatDateTime(it))) }

        book.tags.forEach { tag ->
            tags.add(createMetaTag("og:book:tag", tag))
        }
    }

    /**
     * Adds music.song-specific meta tags to the list.
     *
     * @param musicSong The MusicSong object; nothing is added when it is `null`
     * @param tags The list to add the tags to
     */
    private fun addMusicSongTags(musicSong: MusicSong?, tags: MutableList<String>) {
        if (musicSong == null) return

        musicSong.duration?.let { tags.add(createMetaTag("og:music:duration", it.toString())) }
        musicSong.album?.let { tags.add(createMetaTag("og:music:album", it)) }
        musicSong.albumDisc?.let { tags.add(createMetaTag("og:music:album:disc", it.toString())) }
        musicSong.albumTrack?.let { tags.add(createMetaTag("og:music:album:track", it.toString())) }

        musicSong.musician.forEach { musician ->
            tags.add(createMetaTag("og:music:musician", musician))
        }
    }

    /**
     * Adds music.album-specific meta tags to the list.
     *
     * @param musicAlbum The MusicAlbum object; nothing is added when it is `null`
     * @param tags The list to add the tags to
     */
    private fun addMusicAlbumTags(musicAlbum: MusicAlbum?, tags: MutableList<String>) {
        if (musicAlbum == null) return

        musicAlbum.songs.forEach { song ->
            tags.add(createMetaTag("og:music:song", song))
        }

        musicAlbum.songDisc?.let { tags.add(createMetaTag("og:music:song:disc", it.toString())) }
        musicAlbum.songTrack?.let { tags.add(createMetaTag("og:music:song:track", it.toString())) }

        musicAlbum.musician.forEach { musician ->
            tags.add(createMetaTag("og:music:musician", musician))
        }

        musicAlbum.releaseDate?.let { tags.add(createMetaTag("og:music:release_date", formatDateTime(it))) }
    }

    /**
     * Adds music.playlist-specific meta tags to the list.
     *
     * @param musicPlaylist The MusicPlaylist object; nothing is added when it is `null`
     * @param tags The list to add the tags to
     */
    private fun addMusicPlaylistTags(musicPlaylist: MusicPlaylist?, tags: MutableList<String>) {
        if (musicPlaylist == null) return

        musicPlaylist.songs.forEach { song ->
            tags.add(createMetaTag("og:music:song", song))
        }

        musicPlaylist.songDisc?.let { tags.add(createMetaTag("og:music:song:disc", it.toString())) }
        musicPlaylist.songTrack?.let { tags.add(createMetaTag("og:music:song:track", it.toString())) }
        musicPlaylist.creator?.let { tags.add(createMetaTag("og:music:creator", it)) }
    }

    /**
     * Adds music.radio_station-specific meta tags to the list.
     *
     * @param musicRadioStation The MusicRadioStation object; nothing is added when it is `null`
     * @param tags The list to add the tags to
     */
    private fun addMusicRadioStationTags(musicRadioStation: MusicRadioStation?, tags: MutableList<String>) {
        if (musicRadioStation == null) return

        musicRadioStation.creator?.let { tags.add(createMetaTag("og:music:creator", it)) }
    }

    /**
     * Adds video.movie-specific meta tags to the list.
     *
     * Also used by [generate] for the video.tv_show and video.other types.
     *
     * @param videoMovie The VideoMovie object; nothing is added when it is `null`
     * @param tags The list to add the tags to
     */
    private fun addVideoMovieTags(videoMovie: VideoMovie?, tags: MutableList<String>) {
        if (videoMovie == null) return

        videoMovie.actors.forEach { actor ->
            tags.add(createMetaTag("og:video:actor", actor))
        }

        videoMovie.director.forEach { director ->
            tags.add(createMetaTag("og:video:director", director))
        }

        videoMovie.writer.forEach { writer ->
            tags.add(createMetaTag("og:video:writer", writer))
        }

        videoMovie.duration?.let { tags.add(createMetaTag("og:video:duration", it.toString())) }
        videoMovie.releaseDate?.let { tags.add(createMetaTag("og:video:release_date", formatDateTime(it))) }

        videoMovie.tags.forEach { tag ->
            tags.add(createMetaTag("og:video:tag", tag))
        }
    }

    /**
     * Adds video.episode-specific meta tags to the list.
     *
     * @param videoEpisode The VideoEpisode object; nothing is added when it is `null`
     * @param tags The list to add the tags to
     */
    private fun addVideoEpisodeTags(videoEpisode: VideoEpisode?, tags: MutableList<String>) {
        if (videoEpisode == null) return

        videoEpisode.actors.forEach { actor ->
            tags.add(createMetaTag("og:video:actor", actor))
        }

        videoEpisode.director.forEach { director ->
            tags.add(createMetaTag("og:video:director", director))
        }

        videoEpisode.writer.forEach { writer ->
            tags.add(createMetaTag("og:video:writer", writer))
        }

        videoEpisode.duration?.let { tags.add(createMetaTag("og:video:duration", it.toString())) }
        videoEpisode.releaseDate?.let { tags.add(createMetaTag("og:video:release_date", formatDateTime(it))) }

        videoEpisode.tags.forEach { tag ->
            tags.add(createMetaTag("og:video:tag", tag))
        }

        videoEpisode.series?.let { tags.add(createMetaTag("og:video:series", it)) }
    }

    /**
     * Creates an HTML meta tag with the given property and content.
     *
     * The content is escaped for use inside a double-quoted HTML attribute value.
     * The property is inserted verbatim: every caller in this class passes a
     * hard-coded property name.
     *
     * @param property The property attribute value
     * @param content The content attribute value (unescaped)
     * @return The HTML meta tag string
     */
    private fun createMetaTag(property: String, content: String): String {
        // '&' must be replaced first, otherwise the ampersands introduced by the
        // other entities would be escaped a second time.
        val escapedContent = content
            .replace("&", "&amp;")
            .replace("\"", "&quot;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        return "<meta property=\"$property\" content=\"$escapedContent\" />"
    }

    /**
     * Formats an OffsetDateTime to a string suitable for OpenGraph tags.
     *
     * The value is converted to an instant first, so the original offset is not
     * preserved: the output is always expressed in UTC.
     *
     * @param dateTime The OffsetDateTime to format
     * @return The formatted date string in ISO-8601 format with 'Z' timezone indicator
     */
    private fun formatDateTime(dateTime: OffsetDateTime): String {
        return dateTime.toInstant().toString()
    }
}

View File

@@ -1,5 +1,8 @@
package fr.lengrand.opengraphkt
import java.net.URL
import java.time.OffsetDateTime
/**
* Enum representing the different types of Open Graph objects.
*/
@@ -46,6 +49,21 @@ enum class Type {
}
}
/**
 * Gender of a profile, rendered in lower case (`male` / `female`).
 */
enum class Gender {
    MALE,
    FEMALE;

    /** Returns the constant's name in lower case. */
    override fun toString(): String = name.lowercase()

    companion object {
        /**
         * Resolves a [Gender] from its textual form, ignoring case.
         *
         * @param gender The textual value, e.g. `"male"` or `"FEMALE"`
         * @return The matching constant
         * @throws IllegalArgumentException if [gender] matches no constant
         */
        fun fromString(gender: String): Gender = valueOf(gender.uppercase())
    }
}
data class Tag(
val property: String,
val content: String,
@@ -60,15 +78,15 @@ data class Data(
// Basic metadata
val title: String?,
val type: String?,
val url: String?,
val url: URL?,
val description: String?,
// Other metadata
val siteName: String?,
val determiner: String?,
val locale: String?,
val locale: String?,
val localeAlternate: List<String>,
// Structured properties
val images: List<Image>,
val videos: List<Video>,
val audios: List<Audio>,
@@ -77,14 +95,10 @@ data class Data(
val article: Article?,
val profile: Profile?,
val book: Book?,
// Music types
val musicSong: MusicSong?,
val musicAlbum: MusicAlbum?,
val musicPlaylist: MusicPlaylist?,
val musicRadioStation: MusicRadioStation?,
// Video types
val videoMovie: VideoMovie?,
val videoEpisode: VideoEpisode?
) {
@@ -137,16 +151,19 @@ data class Audio(
val type: String?
)
/**
* * video.tv_show - same as video.movie
* * video.other - same as video.movie
*/
data class Article(
val publishedTime: String?,
val modifiedTime: String?,
val expirationTime: String?,
val section: String?,
val publishedTime: OffsetDateTime?,
val modifiedTime: OffsetDateTime?,
val expirationTime: OffsetDateTime?,
val authors: List<String>,
val section: String?,
val tags: List<String>
)
data class Book(
val authors: List<String>,
val isbn: String?,
val releaseDate: OffsetDateTime?,
val tags: List<String>
)
@@ -154,14 +171,7 @@ data class Profile(
val firstName: String?,
val lastName: String?,
val username: String?,
val gender: String?
)
data class Book(
val authors: List<String>,
val isbn: String?,
val releaseDate: String?,
val tags: List<String>
val gender: Gender?
)
data class MusicSong(
@@ -174,12 +184,16 @@ data class MusicSong(
data class MusicAlbum(
val songs: List<String>,
val songDisc: Int?,
val songTrack: Int?,
val musician: List<String>,
val releaseDate: String?
val releaseDate: OffsetDateTime?
)
/**
 * Open Graph metadata specific to the `music.playlist` type.
 *
 * @property songs contents of every `music:song` tag
 * @property songDisc integer value of the first `music:song:disc` tag, or null when absent or not an integer
 * @property songTrack integer value of the first `music:song:track` tag, or null when absent or not an integer
 * @property creator content of the first `music:creator` tag, or null when absent
 */
data class MusicPlaylist(
val songs: List<String>,
val songDisc: Int?,
val songTrack: Int?,
val creator: String?
)
@@ -192,7 +206,7 @@ data class VideoMovie(
val director: List<String>,
val writer: List<String>,
val duration: Int?,
val releaseDate: String?,
val releaseDate: OffsetDateTime?,
val tags: List<String>
)
@@ -201,7 +215,7 @@ data class VideoEpisode(
val director: List<String>,
val writer: List<String>,
val duration: Int?,
val releaseDate: String?,
val releaseDate: OffsetDateTime?,
val tags: List<String>,
val series: String?
)

View File

@@ -1,10 +1,16 @@
package fr.lengrand.opengraphkt
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.select.Elements
import com.fleeksoft.ksoup.Ksoup
import com.fleeksoft.ksoup.network.parseGetRequestBlocking
import com.fleeksoft.ksoup.nodes.Document
import com.fleeksoft.ksoup.parseFile
import com.fleeksoft.ksoup.select.Elements
import kotlinx.coroutines.runBlocking
import java.io.File
import java.net.URI
import java.net.URL
import java.time.OffsetDateTime
import java.time.format.DateTimeParseException
/**
* A comprehensive parser for Open Graph protocol tags.
@@ -17,11 +23,35 @@ import java.net.URL
*/
class Parser {
/**
 * Parses an ISO 8601 string into an [OffsetDateTime].
 *
 * Accepted forms, tried in this order:
 * 1. a date-time with an offset, e.g. `2023-01-15T12:30:00Z` or `2023-01-15T12:30:00+01:00`;
 * 2. a date-time without an offset, e.g. `2023-01-15T12:30:00`, interpreted as UTC;
 * 3. a date only, e.g. `2023-01-15`, interpreted as midnight UTC.
 *
 * Leading and trailing whitespace is ignored.
 *
 * @param dateTimeString The string to parse
 * @return The parsed OffsetDateTime, or null if the string is null, blank or cannot be parsed
 */
private fun parseDateTime(dateTimeString: String?): OffsetDateTime? {
    val text = dateTimeString?.trim()?.takeUnless { it.isEmpty() } ?: return null

    // Fully qualified java.time names are used so this function needs no additional file-level imports.
    val candidates: List<(String) -> OffsetDateTime> = listOf(
        { value -> OffsetDateTime.parse(value) },
        { value -> java.time.LocalDateTime.parse(value).atOffset(java.time.ZoneOffset.UTC) },
        { value -> java.time.LocalDate.parse(value).atStartOfDay().atOffset(java.time.ZoneOffset.UTC) },
    )

    // The first format that accepts the text wins; a parse failure simply moves on to the next one.
    return candidates.firstNotNullOfOrNull { parseWith ->
        try {
            parseWith(text)
        } catch (_: DateTimeParseException) {
            null
        }
    }
}
/**
* Extracts all Open Graph tags from a Ksoup Document and returns a structured Data object.
*
* @param document The Ksoup Document to parse
* @return An Data object containing all extracted Open Graph data
* @return A Data object containing all extracted Open Graph data
*/
fun parse(document: Document): Data {
val tags = document.select("meta[property^=og:]")
@@ -34,21 +64,20 @@ class Parser {
* Extracts all Open Graph tags from a URL and returns a structured Data object.
*
* @param url The URL to be parsed for Open Graph information.
* @return An Data object containing all extracted Open Graph data.
* @return A Data object containing all extracted Open Graph data.
*/
fun parse(url: URL) : Data {
val doc = Jsoup.connect(url.toString()).get()
return parse(doc)
return parse(Ksoup.parseGetRequestBlocking(url.toString()))
}
/**
* Extracts all Open Graph tags from a raw HTML String and returns a structured Data object.
*
* @param html The raw HTML String to be parsed for Open Graph information.
* @return An Data object containing all extracted Open Graph data.
* @return A Data object containing all extracted Open Graph data.
*/
fun parse(html: String) : Data {
val doc = Jsoup.parse(html)
val doc = Ksoup.parse(html)
return parse(doc)
}
@@ -57,10 +86,12 @@ class Parser {
*
* @param file The file to parse
* @param charset The charset to use for parsing (default is UTF-8)
* @return An Data object containing all extracted Open Graph data.
* @return A Data object containing all extracted Open Graph data.
*/
fun parse(file: File, charset: String = "UTF-8") : Data {
val doc = Jsoup.parse(file, charset)
val doc = runBlocking {
Ksoup.parseFile(file, file.absolutePath, charset)
}
return parse(doc)
}
@@ -100,7 +131,8 @@ class Parser {
// Build basic properties
val title = getFirstTagContent(tags, "title")
val type = getFirstTagContent(tags, "type")
val url = getFirstTagContent(tags, "url")
val urlString = getFirstTagContent(tags, "url")
val url = urlString?.let{URI(urlString).toURL()}
val description = getFirstTagContent(tags, "description")
val siteName = getFirstTagContent(tags, "site_name")
val determiner = getFirstTagContent(tags, "determiner")
@@ -329,9 +361,13 @@ class Parser {
return null
}
val publishedTime = articleTags.firstOrNull { it.property == "article:published_time" }?.content
val modifiedTime = articleTags.firstOrNull { it.property == "article:modified_time" }?.content
val expirationTime = articleTags.firstOrNull { it.property == "article:expiration_time" }?.content
val publishedTimeString = articleTags.firstOrNull { it.property == "article:published_time" }?.content
val modifiedTimeString = articleTags.firstOrNull { it.property == "article:modified_time" }?.content
val expirationTimeString = articleTags.firstOrNull { it.property == "article:expiration_time" }?.content
val publishedTime = parseDateTime(publishedTimeString)
val modifiedTime = parseDateTime(modifiedTimeString)
val expirationTime = parseDateTime(expirationTimeString)
val section = articleTags.firstOrNull { it.property == "article:section" }?.content
val authors = articleTags.filter { it.property == "article:author" }.map { it.content }
val tags = articleTags.filter { it.property == "article:tag" }.map { it.content }
@@ -350,7 +386,7 @@ class Parser {
* Builds a Profile object from profile-related tags.
*
* @param groupedTags The map of grouped Tag objects
* @return An Profile object, or null if no profile tags are found
* @return A Profile object, or null if no profile tags are found
*/
private fun buildProfile(groupedTags: Map<String, List<Tag>>): Profile? {
val profileTags = groupedTags.getOrDefault("profile", emptyList())
@@ -362,7 +398,8 @@ class Parser {
val firstName = profileTags.firstOrNull { it.property == "profile:first_name" }?.content
val lastName = profileTags.firstOrNull { it.property == "profile:last_name" }?.content
val username = profileTags.firstOrNull { it.property == "profile:username" }?.content
val gender = profileTags.firstOrNull { it.property == "profile:gender" }?.content
val genderString = profileTags.firstOrNull { it.property == "profile:gender" }?.content
val gender = genderString?.let(Gender::fromString)
return Profile(
firstName = firstName,
@@ -387,7 +424,8 @@ class Parser {
val authors = bookTags.filter { it.property == "book:author" }.map { it.content }
val isbn = bookTags.firstOrNull { it.property == "book:isbn" }?.content
val releaseDate = bookTags.firstOrNull { it.property == "book:release_date" }?.content
val releaseDateString = bookTags.firstOrNull { it.property == "book:release_date" }?.content
val releaseDate = parseDateTime(releaseDateString)
val tags = bookTags.filter { it.property == "book:tag" }.map { it.content }
return Book(
@@ -440,18 +478,23 @@ class Parser {
}
val songs = musicTags.filter { it.property == "music:song" }.map { it.content }
val songDisc = musicTags.firstOrNull { it.property == "music:song:disc" }?.content?.toIntOrNull()
val songTrack = musicTags.firstOrNull { it.property == "music:song:track" }?.content?.toIntOrNull()
val musicians = musicTags.filter { it.property == "music:musician" }.map { it.content }
val releaseDate = musicTags.firstOrNull { it.property == "music:release_date" }?.content
val releaseDateString = musicTags.firstOrNull { it.property == "music:release_date" }?.content
val releaseDate = parseDateTime(releaseDateString)
return MusicAlbum(
songs = songs,
songDisc = songDisc,
songTrack = songTrack,
musician = musicians,
releaseDate = releaseDate
)
}
/**
* Builds an MusicPlaylist object from music.playlist-related tags.
* Builds a MusicPlaylist object from music.playlist-related tags.
*
* @param groupedTags The map of grouped Tag objects
* @return A MusicPlaylist object, or null if no music.playlist tags are found
@@ -464,16 +507,20 @@ class Parser {
}
val songs = musicTags.filter { it.property == "music:song" }.map { it.content }
val songDisc = musicTags.firstOrNull { it.property == "music:song:disc" }?.content?.toIntOrNull()
val songTrack = musicTags.firstOrNull { it.property == "music:song:track" }?.content?.toIntOrNull()
val creator = musicTags.firstOrNull { it.property == "music:creator" }?.content
return MusicPlaylist(
songs = songs,
songDisc = songDisc,
songTrack = songTrack,
creator = creator
)
}
/**
* Builds an MusicRadioStation object from music.radio_station-related tags.
* Builds a MusicRadioStation object from music.radio_station-related tags.
*
* @param groupedTags The map of grouped Tag objects
* @return A MusicRadioStation object, or null if no music.radio_station tags are found
@@ -509,7 +556,8 @@ class Parser {
val directors = videoTags.filter { it.property == "video:director" }.map { it.content }
val writers = videoTags.filter { it.property == "video:writer" }.map { it.content }
val duration = videoTags.firstOrNull { it.property == "video:duration" }?.content?.toIntOrNull()
val releaseDate = videoTags.firstOrNull { it.property == "video:release_date" }?.content
val releaseDateString = videoTags.firstOrNull { it.property == "video:release_date" }?.content
val releaseDate = parseDateTime(releaseDateString)
val tags = videoTags.filter { it.property == "video:tag" }.map { it.content }
return VideoMovie(
@@ -539,7 +587,8 @@ class Parser {
val directors = videoTags.filter { it.property == "video:director" }.map { it.content }
val writers = videoTags.filter { it.property == "video:writer" }.map { it.content }
val duration = videoTags.firstOrNull { it.property == "video:duration" }?.content?.toIntOrNull()
val releaseDate = videoTags.firstOrNull { it.property == "video:release_date" }?.content
val releaseDateString = videoTags.firstOrNull { it.property == "video:release_date" }?.content
val releaseDate = parseDateTime(releaseDateString)
val tags = videoTags.filter { it.property == "video:tag" }.map { it.content }
val series = videoTags.firstOrNull { it.property == "video:series" }?.content
@@ -553,4 +602,4 @@ class Parser {
series = series
)
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -3,10 +3,18 @@ package fr.lengrand.opengraphkt
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.io.TempDir
import java.io.File
import java.net.URL
import java.time.OffsetDateTime
import kotlin.test.assertEquals
import kotlin.test.assertNotNull
import kotlin.test.assertTrue
// Test helper: fails when the parsed URL is missing, otherwise compares its string form with the expected text
private fun assertUrlEquals(expected: String, actual: URL?) {
    val presentUrl = assertNotNull(actual)
    assertEquals(expected, presentUrl.toString())
}
class ParserTest {
private val parser = Parser()
@@ -21,6 +29,8 @@ class ParserTest {
<meta property="og:type" content="video.movie" />
<meta property="og:url" content="https://example.com/the-rock" />
<meta property="og:image" content="https://example.com/rock.jpg" />
<meta property="og:image:secure_url" content="https://secure.example.com/rock.jpg" />
<meta property="og:image:type" content="image/jpeg" />
<meta property="og:image:width" content="300" />
<meta property="og:image:height" content="200" />
<meta property="og:image:alt" content="A promotional image for The Rock" />
@@ -122,12 +132,18 @@ class ParserTest {
<meta property="og:type" content="website" />
<meta property="og:url" content="https://example.com/gallery" />
<meta property="og:image" content="https://example.com/image1.jpg" />
<meta property="og:image:secure_url" content="https://secure.example.com/image1.jpg" />
<meta property="og:image:type" content="image/jpeg" />
<meta property="og:image:width" content="800" />
<meta property="og:image:height" content="600" />
<meta property="og:image" content="https://example.com/image2.jpg" />
<meta property="og:image:secure_url" content="https://secure.example.com/image2.jpg" />
<meta property="og:image:type" content="image/png" />
<meta property="og:image:width" content="1024" />
<meta property="og:image:height" content="768" />
<meta property="og:image" content="https://example.com/image3.jpg" />
<meta property="og:image:secure_url" content="https://secure.example.com/image3.jpg" />
<meta property="og:image:type" content="image/gif" />
<meta property="og:image:width" content="1200" />
<meta property="og:image:height" content="900" />
<meta property="og:description" content="A gallery of images" />
@@ -145,18 +161,20 @@ class ParserTest {
// Verify that all required properties are extracted correctly
assertEquals("The Rock", openGraphData.title)
assertEquals("video.movie", openGraphData.type)
assertEquals("https://example.com/the-rock", openGraphData.url)
assertUrlEquals("https://example.com/the-rock", openGraphData.url)
// Verify that the OpenGraphData object is valid
assertTrue(openGraphData.isValid())
// Verify that all tags are extracted
assertEquals(18, openGraphData.tags.size)
assertEquals(20, openGraphData.tags.size)
// Verify image properties
assertEquals(1, openGraphData.images.size)
val image = openGraphData.images[0]
assertEquals("https://example.com/rock.jpg", image.url)
assertEquals("https://secure.example.com/rock.jpg", image.secureUrl)
assertEquals("image/jpeg", image.type)
assertEquals(300, image.width)
assertEquals(200, image.height)
assertEquals("A promotional image for The Rock", image.alt)
@@ -198,13 +216,15 @@ class ParserTest {
// Verify basic properties
assertEquals("Breaking News", openGraphData.title)
assertEquals("article", openGraphData.type)
assertEquals("https://example.com/news/breaking", openGraphData.url)
assertUrlEquals("https://example.com/news/breaking", openGraphData.url)
assertEquals("Latest breaking news", openGraphData.description)
// Verify article-specific properties
assertNotNull(openGraphData.article)
assertEquals("2023-01-01T00:00:00Z", openGraphData.article.publishedTime)
assertEquals("2023-01-02T12:00:00Z", openGraphData.article.modifiedTime)
assertNotNull(openGraphData.article.publishedTime)
assertEquals(OffsetDateTime.parse("2023-01-01T00:00:00Z"), openGraphData.article.publishedTime)
assertNotNull(openGraphData.article.modifiedTime)
assertEquals(OffsetDateTime.parse("2023-01-02T12:00:00Z"), openGraphData.article.modifiedTime)
assertEquals("News", openGraphData.article.section)
assertEquals(2, openGraphData.article.authors.size)
assertTrue(openGraphData.article.authors.contains("John Doe"))
@@ -221,7 +241,7 @@ class ParserTest {
// Verify basic properties
assertEquals("John Doe", openGraphData.title)
assertEquals("profile", openGraphData.type)
assertEquals("https://example.com/profile/johndoe", openGraphData.url)
assertUrlEquals("https://example.com/profile/johndoe", openGraphData.url)
assertEquals("John Doe's profile", openGraphData.description)
// Verify profile-specific properties
@@ -229,7 +249,7 @@ class ParserTest {
assertEquals("John", openGraphData.profile.firstName)
assertEquals("Doe", openGraphData.profile.lastName)
assertEquals("johndoe", openGraphData.profile.username)
assertEquals("male", openGraphData.profile.gender)
assertEquals(Gender.MALE, openGraphData.profile.gender)
}
@Test
@@ -239,7 +259,7 @@ class ParserTest {
// Verify basic properties
assertEquals("The Great Novel", openGraphData.title)
assertEquals("book", openGraphData.type)
assertEquals("https://example.com/books/great-novel", openGraphData.url)
assertUrlEquals("https://example.com/books/great-novel", openGraphData.url)
assertEquals("A great novel", openGraphData.description)
// Verify book-specific properties
@@ -247,7 +267,8 @@ class ParserTest {
assertEquals(1, openGraphData.book.authors.size)
assertEquals("Famous Author", openGraphData.book.authors.get(0))
assertEquals("1234567890123", openGraphData.book.isbn)
assertEquals("2023-01-01", openGraphData.book.releaseDate)
assertNotNull(openGraphData.book.releaseDate)
assertEquals(OffsetDateTime.parse("2023-01-01T00:00:00Z"), openGraphData.book.releaseDate)
assertEquals(2, openGraphData.book.tags.size)
assertTrue(openGraphData.book.tags.contains("fiction"))
assertTrue(openGraphData.book.tags.contains("novel"))
@@ -260,7 +281,7 @@ class ParserTest {
// Verify basic properties
assertEquals("Photo Gallery", openGraphData.title)
assertEquals("website", openGraphData.type)
assertEquals("https://example.com/gallery", openGraphData.url)
assertUrlEquals("https://example.com/gallery", openGraphData.url)
assertEquals("A gallery of images", openGraphData.description)
// Verify multiple images
@@ -268,16 +289,22 @@ class ParserTest {
// First image
assertEquals("https://example.com/image1.jpg", openGraphData.images[0].url)
assertEquals("https://secure.example.com/image1.jpg", openGraphData.images[0].secureUrl)
assertEquals("image/jpeg", openGraphData.images[0].type)
assertEquals(800, openGraphData.images[0].width)
assertEquals(600, openGraphData.images[0].height)
// Second image
assertEquals("https://example.com/image2.jpg", openGraphData.images[1].url)
assertEquals("https://secure.example.com/image2.jpg", openGraphData.images[1].secureUrl)
assertEquals("image/png", openGraphData.images[1].type)
assertEquals(1024, openGraphData.images[1].width)
assertEquals(768, openGraphData.images[1].height)
// Third image
assertEquals("https://example.com/image3.jpg", openGraphData.images[2].url)
assertEquals("https://secure.example.com/image3.jpg", openGraphData.images[2].secureUrl)
assertEquals("image/gif", openGraphData.images[2].type)
assertEquals(1200, openGraphData.images[2].width)
assertEquals(900, openGraphData.images[2].height)
}
@@ -293,13 +320,15 @@ class ParserTest {
// Verify basic properties
assertEquals("Breaking News", openGraphData.title)
assertEquals("article", openGraphData.type)
assertEquals("https://example.com/news/breaking", openGraphData.url)
assertUrlEquals("https://example.com/news/breaking", openGraphData.url)
assertEquals("Latest breaking news", openGraphData.description)
// Verify article-specific properties
assertNotNull(openGraphData.article)
assertEquals("2023-01-01T00:00:00Z", openGraphData.article.publishedTime)
assertEquals("2023-01-02T12:00:00Z", openGraphData.article.modifiedTime)
assertNotNull(openGraphData.article.publishedTime)
assertEquals(OffsetDateTime.parse("2023-01-01T00:00:00Z"), openGraphData.article.publishedTime)
assertNotNull(openGraphData.article.modifiedTime)
assertEquals(OffsetDateTime.parse("2023-01-02T12:00:00Z"), openGraphData.article.modifiedTime)
assertEquals("News", openGraphData.article.section)
assertEquals(2, openGraphData.article.authors.size)
assertTrue(openGraphData.article.authors.contains("John Doe"))
@@ -368,7 +397,7 @@ class ParserTest {
// Verify basic properties
assertEquals("The Matrix", openGraphData.title)
assertEquals("video.movie", openGraphData.type)
assertEquals("https://example.com/movies/the-matrix", openGraphData.url)
assertUrlEquals("https://example.com/movies/the-matrix", openGraphData.url)
assertEquals("A sci-fi action movie", openGraphData.description)
// Verify video.movie-specific properties
@@ -383,12 +412,76 @@ class ParserTest {
assertTrue(openGraphData.videoMovie.writer.contains("Lana Wachowski"))
assertTrue(openGraphData.videoMovie.writer.contains("Lilly Wachowski"))
assertEquals(136, openGraphData.videoMovie.duration)
assertEquals("1999-03-31", openGraphData.videoMovie.releaseDate)
assertNotNull(openGraphData.videoMovie.releaseDate)
assertEquals(OffsetDateTime.parse("1999-03-31T00:00:00Z"), openGraphData.videoMovie.releaseDate)
assertEquals(2, openGraphData.videoMovie.tags.size)
assertTrue(openGraphData.videoMovie.tags.contains("sci-fi"))
assertTrue(openGraphData.videoMovie.tags.contains("action"))
}
// Sample HTML with music.album-specific tags
private val musicAlbumHtml = """
<!DOCTYPE html>
<html>
<head>
<title>Music Album Example</title>
<meta property="og:title" content="Greatest Hits" />
<meta property="og:type" content="music.album" />
<meta property="og:url" content="https://example.com/albums/greatest-hits" />
<meta property="og:image" content="https://example.com/album-cover.jpg" />
<meta property="og:description" content="A collection of greatest hits" />
<meta property="og:music:song" content="Song 1" />
<meta property="og:music:song" content="Song 2" />
<meta property="og:music:song:disc" content="1" />
<meta property="og:music:song:track" content="1" />
<meta property="og:music:musician" content="Famous Musician" />
<meta property="og:music:musician" content="Another Musician" />
<meta property="og:music:release_date" content="2023-01-15T12:30:00Z" />
</head>
<body>
<h1>Greatest Hits</h1>
</body>
</html>
""".trimIndent()
@Test
fun `test parse with music album-specific tags`() {
    val parsed = parser.parse(musicAlbumHtml)

    // Core Open Graph fields
    assertEquals("Greatest Hits", parsed.title)
    assertEquals("music.album", parsed.type)
    assertUrlEquals("https://example.com/albums/greatest-hits", parsed.url)
    assertEquals("A collection of greatest hits", parsed.description)

    // Structured music.album data
    val album = assertNotNull(parsed.musicAlbum)
    assertEquals(2, album.songs.size)
    listOf("Song 1", "Song 2").forEach { song -> assertTrue(album.songs.contains(song)) }
    assertEquals(1, album.songDisc)
    assertEquals(1, album.songTrack)
    assertEquals(2, album.musician.size)
    listOf("Famous Musician", "Another Musician").forEach { artist ->
        assertTrue(album.musician.contains(artist))
    }

    // The full timestamp must come back as the equivalent OffsetDateTime
    val released = assertNotNull(album.releaseDate)
    assertEquals(OffsetDateTime.parse("2023-01-15T12:30:00Z"), released)
}
@Test
fun `test parse with date-only release date`() {
    // Reuse the album fixture, swapping its full timestamp for a bare calendar date
    val html = musicAlbumHtml.replace("2023-01-15T12:30:00Z", "2023-01-15")

    val album = assertNotNull(parser.parse(html).musicAlbum)
    val released = assertNotNull(album.releaseDate)

    // A date without a time component is expected to resolve to midnight UTC
    assertEquals(OffsetDateTime.parse("2023-01-15T00:00:00Z"), released)
}
@Test
fun `test getType method returns correct enum values`() {
// Test video.movie type
@@ -407,6 +500,10 @@ class ParserTest {
val bookData = parser.parse(bookHtml)
assertEquals(Type.BOOK, bookData.getType())
// Test music.album type
val musicAlbumData = parser.parse(musicAlbumHtml)
assertEquals(Type.MUSIC_ALBUM, musicAlbumData.getType())
// Test website type
val websiteData = parser.parse(multipleImagesHtml)
assertEquals(Type.WEBSITE, websiteData.getType())
@@ -419,4 +516,196 @@ class ParserTest {
val unkwownData = parser.parse(unknownTypeHtml)
assertEquals(Type.UNKNOWN, unkwownData.getType())
}
// Sample HTML with music.song-specific tags
private val musicSongHtml = """
<!DOCTYPE html>
<html>
<head>
<title>Music Song Example</title>
<meta property="og:title" content="Awesome Song" />
<meta property="og:type" content="music.song" />
<meta property="og:url" content="https://example.com/songs/awesome-song" />
<meta property="og:image" content="https://example.com/song-cover.jpg" />
<meta property="og:description" content="An awesome song" />
<meta property="og:music:duration" content="240" />
<meta property="og:music:album" content="Awesome Album" />
<meta property="og:music:album:disc" content="1" />
<meta property="og:music:album:track" content="3" />
<meta property="og:music:musician" content="Awesome Artist" />
<meta property="og:music:musician" content="Featured Artist" />
</head>
<body>
<h1>Awesome Song</h1>
</body>
</html>
""".trimIndent()
@Test
fun `test parse with music song-specific tags`() {
    val parsed = parser.parse(musicSongHtml)

    // Core Open Graph fields
    assertEquals("Awesome Song", parsed.title)
    assertEquals("music.song", parsed.type)
    assertUrlEquals("https://example.com/songs/awesome-song", parsed.url)
    assertEquals("An awesome song", parsed.description)

    // Structured music.song data
    val song = assertNotNull(parsed.musicSong)
    assertEquals(240, song.duration)
    assertEquals("Awesome Album", song.album)
    assertEquals(1, song.albumDisc)
    assertEquals(3, song.albumTrack)
    assertEquals(2, song.musician.size)
    listOf("Awesome Artist", "Featured Artist").forEach { artist ->
        assertTrue(song.musician.contains(artist))
    }
}
// Sample HTML with music.playlist-specific tags
private val musicPlaylistHtml = """
<!DOCTYPE html>
<html>
<head>
<title>Music Playlist Example</title>
<meta property="og:title" content="Awesome Playlist" />
<meta property="og:type" content="music.playlist" />
<meta property="og:url" content="https://example.com/playlists/awesome-playlist" />
<meta property="og:image" content="https://example.com/playlist-cover.jpg" />
<meta property="og:description" content="An awesome playlist" />
<meta property="og:music:song" content="Song 1" />
<meta property="og:music:song" content="Song 2" />
<meta property="og:music:song" content="Song 3" />
<meta property="og:music:song:disc" content="1" />
<meta property="og:music:song:track" content="1" />
<meta property="og:music:creator" content="Playlist Creator" />
</head>
<body>
<h1>Awesome Playlist</h1>
</body>
</html>
""".trimIndent()
@Test
fun `test parse with music playlist-specific tags`() {
    val parsed = parser.parse(musicPlaylistHtml)

    // Core Open Graph fields
    assertEquals("Awesome Playlist", parsed.title)
    assertEquals("music.playlist", parsed.type)
    assertUrlEquals("https://example.com/playlists/awesome-playlist", parsed.url)
    assertEquals("An awesome playlist", parsed.description)

    // Structured music.playlist data
    val playlist = assertNotNull(parsed.musicPlaylist)
    assertEquals(3, playlist.songs.size)
    listOf("Song 1", "Song 2", "Song 3").forEach { song ->
        assertTrue(playlist.songs.contains(song))
    }
    assertEquals(1, playlist.songDisc)
    assertEquals(1, playlist.songTrack)
    assertEquals("Playlist Creator", playlist.creator)
}
// Sample HTML with music.radio_station-specific tags
private val musicRadioStationHtml = """
<!DOCTYPE html>
<html>
<head>
<title>Music Radio Station Example</title>
<meta property="og:title" content="Awesome Radio" />
<meta property="og:type" content="music.radio_station" />
<meta property="og:url" content="https://example.com/radio/awesome-radio" />
<meta property="og:image" content="https://example.com/radio-logo.jpg" />
<meta property="og:description" content="An awesome radio station" />
<meta property="og:music:creator" content="Radio Creator" />
</head>
<body>
<h1>Awesome Radio</h1>
</body>
</html>
""".trimIndent()
@Test
fun `test parse with music radio station-specific tags`() {
    val parsed = parser.parse(musicRadioStationHtml)

    // Core Open Graph fields
    assertEquals("Awesome Radio", parsed.title)
    assertEquals("music.radio_station", parsed.type)
    assertUrlEquals("https://example.com/radio/awesome-radio", parsed.url)
    assertEquals("An awesome radio station", parsed.description)

    // Structured music.radio_station data
    val station = assertNotNull(parsed.musicRadioStation)
    assertEquals("Radio Creator", station.creator)
}
// Sample HTML with video.episode-specific tags
private val videoEpisodeHtml = """
<!DOCTYPE html>
<html>
<head>
<title>Video Episode Example</title>
<meta property="og:title" content="Awesome Episode" />
<meta property="og:type" content="video.episode" />
<meta property="og:url" content="https://example.com/episodes/awesome-episode" />
<meta property="og:image" content="https://example.com/episode-thumbnail.jpg" />
<meta property="og:description" content="An awesome episode" />
<meta property="og:video:actor" content="Actor 1" />
<meta property="og:video:actor" content="Actor 2" />
<meta property="og:video:director" content="Director 1" />
<meta property="og:video:writer" content="Writer 1" />
<meta property="og:video:writer" content="Writer 2" />
<meta property="og:video:duration" content="45" />
<meta property="og:video:release_date" content="2023-05-15" />
<meta property="og:video:tag" content="drama" />
<meta property="og:video:tag" content="comedy" />
<meta property="og:video:series" content="Awesome Series" />
</head>
<body>
<h1>Awesome Episode</h1>
</body>
</html>
""".trimIndent()
@Test
fun `test parse with video episode-specific tags`() {
    val parsed = parser.parse(videoEpisodeHtml)

    // Core Open Graph fields
    assertEquals("Awesome Episode", parsed.title)
    assertEquals("video.episode", parsed.type)
    assertUrlEquals("https://example.com/episodes/awesome-episode", parsed.url)
    assertEquals("An awesome episode", parsed.description)

    // Structured video.episode data
    val episode = assertNotNull(parsed.videoEpisode)
    assertEquals(2, episode.actors.size)
    listOf("Actor 1", "Actor 2").forEach { actor -> assertTrue(episode.actors.contains(actor)) }
    assertEquals(1, episode.director.size)
    assertTrue(episode.director.contains("Director 1"))
    assertEquals(2, episode.writer.size)
    listOf("Writer 1", "Writer 2").forEach { writer -> assertTrue(episode.writer.contains(writer)) }
    assertEquals(45, episode.duration)

    // The date-only release date is expected at midnight UTC
    val released = assertNotNull(episode.releaseDate)
    assertEquals(OffsetDateTime.parse("2023-05-15T00:00:00Z"), released)

    assertEquals(2, episode.tags.size)
    listOf("drama", "comedy").forEach { tag -> assertTrue(episode.tags.contains(tag)) }
    assertEquals("Awesome Series", episode.series)
}
@Test
fun `test Gender enum toString method`() {
    // toString yields the lowercase constant name
    assertEquals("male", Gender.MALE.toString())
    assertEquals("female", Gender.FEMALE.toString())

    // fromString accepts the upper-case constant names...
    assertEquals(Gender.MALE, Gender.fromString("MALE"))
    assertEquals(Gender.FEMALE, Gender.fromString("FEMALE"))

    // ...as well as the lower-case and mixed-case spellings found in real profile:gender tags
    assertEquals(Gender.MALE, Gender.fromString("male"))
    assertEquals(Gender.FEMALE, Gender.fromString("female"))
    assertEquals(Gender.FEMALE, Gender.fromString("Female"))

    // toString and fromString round-trip for every constant
    Gender.values().forEach { gender ->
        assertEquals(gender, Gender.fromString(gender.toString()))
    }
}
}

1
scrape-test/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
data/web

20
scrape-test/README.md Normal file
View File

@@ -0,0 +1,20 @@
# Scrape test module
The scrape test module is intended to test the implementation of the library at scale by parsing a large number of web pages and checking the quality of the results.
## Data
At the moment:
* one dataset was found on [Kaggle](https://www.kaggle.com/datasets/hetulmehta/website-classification).
* another on [Moz](https://moz.com/top-500/download/?table=top500Domains) (Top 500 most visited websites).
I'd like a more varied set of data from different types of sources, since the current set seems to contain mostly homepages, but such data is surprisingly hard to find.
## Running the tests
For various reasons, I am not uploading the actual data of the various URLs. To run the analysis yourself:
1. Run `Scraper.kt` once, which will grab all the webpages and place them in the `data/web` folder.
2. Run `ParserTest.kt`, which will run the `Parser` on each of those web pages and check whether the tags can be extracted, and if the page is considered valid.

View File

@@ -0,0 +1,28 @@
// Build script for the scrape-test module, which runs the opengraphkt parser against downloaded web pages.
plugins {
    id("java")
    kotlin("jvm")
}

group = "fr.lengrand"
version = "unspecified"

repositories {
    mavenCentral()
}

dependencies {
    testImplementation(platform("org.junit:junit-bom:5.14.0"))
    testImplementation("org.junit.jupiter:junit-jupiter")
    implementation(kotlin("stdlib-jdk8"))
    implementation(project(":opengraphkt"))

    // Keep every Ktor client artifact on the same version: the CIO engine is built against
    // the matching core release, so declaring an older core version is misleading.
    implementation("io.ktor:ktor-client-core:3.4.1")
    implementation("io.ktor:ktor-client-cio:3.4.1")
}

tasks.test {
    useJUnitPlatform()
}

kotlin {
    jvmToolchain(17)
}
501
scrape-test/data/top500.csv Normal file
View File

@@ -0,0 +1,501 @@
"Rank","Root Domain","Linking Root Domains","Domain Authority"
"1","www.google.com","15,236,114","100"
"2","www.blogger.com","31,311,113","100"
"3","youtube.com","24,336,912","100"
"4","linkedin.com","13,291,390","99"
"5","support.google.com","5,720,703","99"
"6","cloudflare.com","8,211,585","99"
"7","microsoft.com","5,593,547","99"
"8","apple.com","6,849,526","99"
"9","en.wikipedia.org","7,201,596","98"
"10","play.google.com","4,012,038","98"
"11","wordpress.org","12,511,154","98"
"12","docs.google.com","3,642,278","98"
"13","mozilla.org","2,593,193","98"
"14","maps.google.com","6,190,949","98"
"15","youtu.be","5,434,247","98"
"16","drive.google.com","2,681,591","97"
"17","bp.blogspot.com","18,327,022","97"
"18","sites.google.com","2,401,535","97"
"19","googleusercontent.com","3,994,187","97"
"20","accounts.google.com","2,557,208","97"
"21","t.me","1,826,000","97"
"22","europa.eu","2,437,683","97"
"23","plus.google.com","10,955,614","97"
"24","whatsapp.com","4,778,976","97"
"25","adobe.com","2,880,183","96"
"26","facebook.com","61,926,417","96"
"27","policies.google.com","3,521,103","96"
"28","uol.com.br","694,206","96"
"29","istockphoto.com","3,728,189","96"
"30","vimeo.com","3,628,948","96"
"31","vk.com","1,869,205","96"
"32","github.com","3,170,446","96"
"33","amazon.com","5,149,651","96"
"34","search.google.com","1,825,467","95"
"35","bbc.co.uk","1,750,633","95"
"36","google.de","1,083,507","95"
"37","live.com","1,022,973","95"
"38","gravatar.com","12,679,255","95"
"39","nih.gov","1,591,787","95"
"40","dan.com","4,340,736","95"
"41","files.wordpress.com","7,667,702","95"
"42","www.yahoo.com","1,307,493","95"
"43","cnn.com","1,672,093","95"
"44","dropbox.com","1,124,594","95"
"45","wikimedia.org","2,113,156","95"
"46","creativecommons.org","1,780,143","95"
"47","google.com.br","298,643","95"
"48","line.me","1,120,656","95"
"49","googleblog.com","4,497,927","95"
"50","opera.com","1,037,979","95"
"51","es.wikipedia.org","995,228","95"
"52","globo.com","468,129","95"
"53","brandbucket.com","11,171,565","95"
"54","myspace.com","1,364,994","95"
"55","slideshare.net","1,002,121","95"
"56","paypal.com","1,188,172","95"
"57","tiktok.com","1,499,229","95"
"58","netvibes.com","1,238,045","95"
"59","theguardian.com","1,607,812","95"
"60","who.int","2,039,611","95"
"61","goo.gl","5,175,255","95"
"62","medium.com","1,869,221","95"
"63","tools.google.com","1,854,489","95"
"64","draft.blogger.com","12,332,795","95"
"65","pt.wikipedia.org","426,145","95"
"66","fr.wikipedia.org","659,228","95"
"67","www.weebly.com","6,870,155","95"
"68","news.google.com","870,057","95"
"69","developers.google.com","1,170,712","95"
"70","w3.org","1,145,140","95"
"71","mail.google.com","691,883","95"
"72","gstatic.com","642,344","95"
"73","jimdofree.com","1,700,543","95"
"74","cpanel.net","2,172,574","95"
"75","imdb.com","1,561,359","95"
"76","wa.me","2,064,272","95"
"77","feedburner.com","1,792,625","95"
"78","enable-javascript.com","5,140,325","95"
"79","nytimes.com","2,218,148","95"
"80","workspace.google.com","774,353","95"
"81","ok.ru","378,557","95"
"82","google.es","480,733","95"
"83","dailymotion.com","1,132,411","95"
"84","afternic.com","2,480,391","94"
"85","bloomberg.com","887,696","94"
"86","amazon.de","569,270","94"
"87","photos.google.com","278,989","94"
"88","wiley.com","704,290","94"
"89","aliexpress.com","544,478","94"
"90","indiatimes.com","515,121","94"
"91","youronlinechoices.com","592,485","94"
"92","elpais.com","445,804","94"
"93","tinyurl.com","1,475,080","94"
"94","yadi.sk","158,812","94"
"95","spotify.com","1,828,736","94"
"96","huffpost.com","1,193,239","94"
"97","ru.wikipedia.org","378,068","94"
"98","google.fr","417,006","94"
"99","webmd.com","851,236","94"
"100","samsung.com","427,587","94"
"101","independent.co.uk","784,346","94"
"102","amazon.co.jp","884,321","94"
"103","get.google.com","626,795","94"
"104","amazon.co.uk","806,792","94"
"105","4shared.com","574,070","94"
"106","telegram.me","445,468","94"
"107","planalto.gov.br","126,833","94"
"108","businessinsider.com","878,711","94"
"109","ig.com.br","159,348","94"
"110","issuu.com","1,033,499","94"
"111","www.gov.br","187,184","94"
"112","wsj.com","999,835","94"
"113","hugedomains.com","16,362,198","94"
"114","picasaweb.google.com","619,826","94"
"115","usatoday.com","896,757","94"
"116","scribd.com","774,108","94"
"117","www.gov.uk","652,298","94"
"118","storage.googleapis.com","1,116,110","94"
"119","huffingtonpost.com","1,066,209","94"
"120","bbc.com","939,892","94"
"121","estadao.com.br","138,770","94"
"122","nature.com","690,832","94"
"123","mediafire.com","897,442","94"
"124","washingtonpost.com","1,194,795","94"
"125","forms.gle","966,378","94"
"126","namecheap.com","1,072,723","94"
"127","forbes.com","1,489,798","94"
"128","mirror.co.uk","428,559","94"
"129","soundcloud.com","1,918,274","94"
"130","fb.com","486,554","94"
"131","marketingplatform.google....","917,237","94"
"132","domainmarket.com","943,784","94"
"133","ytimg.com","1,070,336","94"
"134","terra.com.br","200,774","94"
"135","google.co.uk","590,081","94"
"136","shutterstock.com","563,596","94"
"137","dailymail.co.uk","1,133,056","94"
"138","reg.ru","540,012","94"
"139","t.co","2,196,874","94"
"140","cdc.gov","962,959","94"
"141","thesun.co.uk","430,774","94"
"142","wp.com","2,247,022","94"
"143","cnet.com","761,368","94"
"144","instagram.com","30,964,179","94"
"145","researchgate.net","797,083","94"
"146","google.it","443,661","94"
"147","fandom.com","659,960","94"
"148","office.com","756,429","94"
"149","list-manage.com","795,381","94"
"150","msn.com","1,086,862","94"
"151","un.org","649,321","94"
"152","de.wikipedia.org","682,828","94"
"153","ovh.com","678,504","94"
"154","mail.ru","481,079","94"
"155","bing.com","1,075,800","94"
"156","news.yahoo.com","720,761","94"
"157","myaccount.google.com","378,003","94"
"158","hatena.ne.jp","1,641,114","94"
"159","shopify.com","3,485,537","94"
"160","adssettings.google.com","482,210","94"
"161","bit.ly","5,047,889","94"
"162","reuters.com","971,280","94"
"163","booking.com","431,062","94"
"164","discord.com","507,193","94"
"165","buydomains.com","1,240,200","94"
"166","nasa.gov","707,621","94"
"167","aboutads.info","723,570","94"
"168","time.com","858,096","94"
"169","abril.com.br","279,992","94"
"170","change.org","509,706","94"
"171","nginx.org","1,177,240","94"
"172","twitter.com","61,414,860","94"
"173","www.wikipedia.org","553,224","94"
"174","archive.org","1,389,450","94"
"175","cbsnews.com","705,330","94"
"176","networkadvertising.org","707,285","94"
"177","telegraph.co.uk","1,014,460","94"
"178","pinterest.com","9,106,097","94"
"179","google.co.jp","648,233","94"
"180","pixabay.com","510,897","94"
"181","zendesk.com","542,253","93"
"182","cpanel.com","970,971","93"
"183","vistaprint.com","815,228","93"
"184","sky.com","251,300","93"
"185","windows.net","460,289","93"
"186","alicdn.com","572,952","93"
"187","google.ca","339,139","93"
"188","lemonde.fr","287,813","93"
"189","newyorker.com","453,295","93"
"190","webnode.page","516,598","93"
"191","surveymonkey.com","459,483","93"
"192","translate.google.com","297,180","93"
"193","calendar.google.com","227,821","93"
"194","amazonaws.com","419,795","93"
"195","academia.edu","426,259","93"
"196","apache.org","1,094,485","93"
"197","imageshack.us","707,362","93"
"198","akamaihd.net","599,670","93"
"199","nginx.com","1,090,331","93"
"200","discord.gg","467,826","93"
"201","thetimes.co.uk","435,853","93"
"202","search.yahoo.com","525,853","93"
"203","amazon.fr","269,771","93"
"204","yelp.com","987,170","93"
"205","berkeley.edu","510,730","93"
"206","google.ru","198,345","93"
"207","sedoparking.com","726,100","93"
"208","cbc.ca","512,969","93"
"209","unesco.org","372,647","93"
"210","ggpht.com","790,656","93"
"211","privacyshield.gov","426,999","93"
"212","www.over-blog.com","811,888","93"
"213","clarin.com","150,420","93"
"214","www.wix.com","2,809,873","93"
"215","whitehouse.gov","423,904","93"
"216","icann.org","646,968","93"
"217","gnu.org","597,621","93"
"218","yandex.ru","1,067,333","93"
"219","francetvinfo.fr","184,067","93"
"220","gmail.com","216,193","93"
"221","mozilla.com","208,426","93"
"222","ziddu.com","192,713","93"
"223","guardian.co.uk","597,395","93"
"224","twitch.tv","553,259","93"
"225","sedo.com","2,498,739","93"
"226","foxnews.com","600,891","93"
"227","rambler.ru","934,373","93"
"228","books.google.com","453,152","93"
"229","stanford.edu","672,405","93"
"230","wikihow.com","664,665","93"
"231","it.wikipedia.org","354,035","93"
"232","20minutos.es","183,667","93"
"233","sfgate.com","387,976","93"
"234","liveinternet.ru","358,214","93"
"235","ja.wikipedia.org","377,585","93"
"236","000webhost.com","243,969","93"
"237","espn.com","410,145","93"
"238","eventbrite.com","685,661","93"
"239","disney.com","269,415","93"
"240","statista.com","428,155","93"
"241","addthis.com","637,495","93"
"242","pinterest.fr","122,907","93"
"243","lavanguardia.com","173,195","93"
"244","vkontakte.ru","336,278","93"
"245","doubleclick.net","496,992","93"
"246","bp2.blogger.com","561,940","93"
"247","skype.com","437,720","93"
"248","sciencedaily.com","380,140","93"
"249","bloglovin.com","547,875","93"
"250","insider.com","407,525","93"
"251","pl.wikipedia.org","150,769","93"
"252","sputniknews.com","185,708","93"
"253","id.wikipedia.org","567,598","93"
"254","doi.org","619,455","93"
"255","nypost.com","457,559","93"
"256","elmundo.es","248,073","93"
"257","abcnews.go.com","596,975","93"
"258","ipv4.google.com","325,798","93"
"259","deezer.com","177,906","93"
"260","express.co.uk","393,591","93"
"261","detik.com","407,103","93"
"262","mystrikingly.com","424,979","93"
"263","rakuten.co.jp","672,132","93"
"264","amzn.to","1,050,344","93"
"265","arxiv.org","293,699","93"
"266","alibaba.com","403,447","93"
"267","fb.me","354,320","93"
"268","wikia.com","505,599","93"
"269","t-online.de","263,484","93"
"270","telegra.ph","336,651","93"
"271","mega.nz","211,685","93"
"272","usnews.com","487,873","93"
"273","plos.org","342,614","93"
"274","naver.com","551,101","93"
"275","ibm.com","551,601","93"
"276","smh.com.au","339,840","93"
"277","dw.com","414,201","93"
"278","google.nl","276,116","93"
"279","lefigaro.fr","234,085","93"
"280","bp1.blogger.com","561,364","93"
"281","picasa.google.com","257,556","93"
"282","theatlantic.com","598,135","93"
"283","nydailynews.com","390,352","93"
"284","themeforest.net","545,669","93"
"285","rtve.es","200,359","93"
"286","newsweek.com","423,875","93"
"287","ovh.net","543,402","93"
"288","ca.gov","581,381","93"
"289","goodreads.com","954,326","93"
"290","economist.com","404,606","93"
"291","target.com","317,574","93"
"292","marca.com","126,447","93"
"293","kickstarter.com","504,234","93"
"294","hindustantimes.com","262,717","93"
"295","weibo.com","1,415,948","93"
"296","finance.yahoo.com","508,841","93"
"297","huawei.com","1,938,323","93"
"298","e-monsite.com","145,440","93"
"299","hubspot.com","385,555","93"
"300","npr.org","895,466","93"
"301","netflix.com","404,990","93"
"302","gizmodo.com","410,719","93"
"303","netlify.app","445,381","93"
"304","yandex.com","1,553,717","93"
"305","mashable.com","457,794","93"
"306","cnil.fr","251,699","93"
"307","latimes.com","748,209","93"
"308","steampowered.com","318,831","93"
"309","rt.com","313,229","93"
"310","photobucket.com","1,625,154","93"
"311","quora.com","495,634","93"
"312","nbcnews.com","722,611","93"
"313","android.com","323,946","93"
"314","instructables.com","364,549","93"
"315","www.canalblog.com","410,731","93"
"316","www.livejournal.com","3,021,508","93"
"317","ouest-france.fr","145,514","93"
"318","tripadvisor.com","784,126","93"
"319","ovhcloud.com","683,341","93"
"320","pexels.com","397,595","93"
"321","oracle.com","646,434","93"
"322","yahoo.co.jp","670,052","93"
"323","addtoany.com","794,722","93"
"324","sakura.ne.jp","413,375","93"
"325","cointernet.com.co","693,382","93"
"326","twimg.com","825,776","93"
"327","britannica.com","588,729","93"
"328","php.net","689,106","93"
"329","standard.co.uk","274,064","93"
"330","groups.google.com","419,267","93"
"331","cnbc.com","613,798","93"
"332","loc.gov","492,836","93"
"333","qq.com","4,380,702","93"
"334","buzzfeed.com","637,933","93"
"335","godaddy.com","2,529,806","93"
"336","ikea.com","384,319","93"
"337","disqus.com","928,440","93"
"338","taringa.net","151,257","93"
"339","ea.com","237,847","93"
"340","dropcatch.com","875,677","93"
"341","techcrunch.com","543,668","93"
"342","canva.com","362,582","93"
"343","offset.com","1,988,241","93"
"344","ebay.com","961,717","93"
"345","zoom.us","704,595","93"
"346","cambridge.org","397,605","93"
"347","unsplash.com","687,293","93"
"348","playstation.com","273,692","93"
"349","people.com","288,670","93"
"350","springer.com","573,622","93"
"351","psychologytoday.com","418,913","93"
"352","sendspace.com","151,427","93"
"353","home.pl","217,375","93"
"354","rapidshare.com","293,608","93"
"355","prezi.com","344,865","93"
"356","photos1.blogger.com","780,018","93"
"357","thenai.org","462,289","93"
"358","ftc.gov","314,810","93"
"359","google.pl","190,864","93"
"360","ted.com","657,104","93"
"361","secureserver.net","872,578","93"
"362","code.google.com","317,681","93"
"363","plesk.com","718,603","93"
"364","aol.com","670,116","93"
"365","biglobe.ne.jp","270,506","93"
"366","hp.com","504,078","93"
"367","canada.ca","330,831","93"
"368","linktr.ee","605,356","93"
"369","hollywoodreporter.com","319,018","93"
"370","ietf.org","414,873","93"
"371","clickbank.net","469,022","93"
"372","harvard.edu","826,745","93"
"373","amazon.es","204,999","93"
"374","oup.com","460,417","93"
"375","timeweb.ru","664,889","93"
"376","engadget.com","448,101","93"
"377","vice.com","427,956","93"
"378","cornell.edu","496,567","93"
"379","dreamstime.com","454,338","93"
"380","tmz.com","243,067","93"
"381","gofundme.com","371,903","93"
"382","pbs.org","564,342","93"
"383","stackoverflow.com","413,584","93"
"384","abc.net.au","429,595","93"
"385","sciencedirect.com","754,026","93"
"386","ft.com","528,414","93"
"387","variety.com","333,021","93"
"388","alexa.com","307,499","93"
"389","abc.es","213,186","93"
"390","walmart.com","390,872","93"
"391","gooyaabitemplates.com","600,604","93"
"392","redbull.com","175,031","93"
"393","ssl-images-amazon.com","587,621","93"
"394","theverge.com","439,537","93"
"395","spiegel.de","374,585","93"
"396","about.com","732,285","93"
"397","nationalgeographic.com","653,291","93"
"398","bandcamp.com","648,189","93"
"399","m.wikipedia.org","353,593","93"
"400","zippyshare.com","189,307","93"
"401","wired.com","713,599","93"
"402","freepik.com","386,749","93"
"403","outlook.com","427,062","93"
"404","mit.edu","757,903","93"
"405","sapo.pt","240,677","93"
"406","goo.ne.jp","332,616","92"
"407","java.com","151,581","92"
"408","google.co.th","120,992","92"
"409","scmp.com","204,983","92"
"410","mayoclinic.org","464,667","92"
"411","scholastic.com","200,881","92"
"412","nba.com","253,654","92"
"413","reverbnation.com","227,313","92"
"414","depositfiles.com","143,553","92"
"415","video.google.com","213,021","92"
"416","howstuffworks.com","324,205","92"
"417","cbslocal.com","310,532","92"
"418","merriam-webster.com","348,548","92"
"419","focus.de","167,487","92"
"420","admin.ch","232,963","92"
"421","gfycat.com","161,812","92"
"422","com.com","246,021","92"
"423","narod.ru","272,108","92"
"424","boston.com","327,894","92"
"425","sony.com","176,593","92"
"426","justjared.com","124,409","92"
"427","bitly.com","371,241","92"
"428","jstor.org","275,031","92"
"429","amebaownd.com","192,731","92"
"430","g.co","188,465","92"
"431","gsmarena.com","137,657","92"
"432","lexpress.fr","123,852","92"
"433","reddit.com","7,039,676","92"
"434","usgs.gov","255,999","92"
"435","bigcommerce.com","355,749","92"
"436","gettyimages.com","404,317","92"
"437","ign.com","331,808","92"
"438","justgiving.com","162,197","92"
"439","techradar.com","212,371","92"
"440","weather.com","233,771","92"
"441","amazon.ca","256,117","92"
"442","justice.gov","233,706","92"
"443","sciencemag.org","317,941","92"
"444","pcmag.com","317,511","92"
"445","theconversation.com","373,863","92"
"446","foursquare.com","274,397","92"
"447","flickr.com","9,156,536","92"
"448","giphy.com","471,107","92"
"449","tvtropes.org","143,379","92"
"450","fifa.com","201,899","92"
"451","upenn.edu","339,397","92"
"452","digg.com","816,971","92"
"453","bestfreecams.club","394,385","92"
"454","histats.com","452,459","92"
"455","salesforce.com","256,815","92"
"456","blog.google","157,787","92"
"457","apnews.com","331,978","92"
"458","theglobeandmail.com","275,289","92"
"459","m.me","268,002","92"
"460","europapress.es","120,539","92"
"461","washington.edu","390,985","92"
"462","thefreedictionary.com","269,796","92"
"463","jhu.edu","263,019","92"
"464","euronews.com","220,805","92"
"465","liberation.fr","127,411","92"
"466","ads.google.com","167,381","92"
"467","trustpilot.com","528,385","92"
"468","google.com.tw","148,361","92"
"469","softonic.com","163,143","92"
"470","kakao.com","173,573","92"
"471","storage.canalblog.com","320,126","92"
"472","interia.pl","163,015","92"
"473","metro.co.uk","286,308","92"
"474","viglink.com","398,156","92"
"475","last.fm","444,132","92"
"476","blackberry.com","145,035","92"
"477","public-api.wordpress.com","188,788","92"
"478","sina.com.cn","993,710","92"
"479","unicef.org","222,033","92"
"480","archives.gov","286,115","92"
"481","nps.gov","392,668","92"
"482","utexas.edu","291,645","92"
"483","biblegateway.com","289,507","92"
"484","usda.gov","446,386","92"
"485","indiegogo.com","276,002","92"
"486","nikkei.com","262,714","92"
"487","radiofrance.fr","137,232","92"
"488","repubblica.it","216,052","92"
"489","substack.com","272,562","92"
"490","ap.org","195,696","92"
"491","nicovideo.jp","165,700","92"
"492","joomla.org","224,141","92"
"493","news.com.au","300,915","92"
"494","allaboutcookies.org","477,121","92"
"495","mailchimp.com","421,675","92"
"496","stores.jp","440,302","92"
"497","intel.com","268,542","92"
"498","bp0.blogger.com","561,170","92"
"499","box.com","288,327","92"
"500","nhk.or.jp","256,113","92"
1 Rank Root Domain Linking Root Domains Domain Authority
2 1 www.google.com 15,236,114 100
3 2 www.blogger.com 31,311,113 100
4 3 youtube.com 24,336,912 100
5 4 linkedin.com 13,291,390 99
6 5 support.google.com 5,720,703 99
7 6 cloudflare.com 8,211,585 99
8 7 microsoft.com 5,593,547 99
9 8 apple.com 6,849,526 99
10 9 en.wikipedia.org 7,201,596 98
11 10 play.google.com 4,012,038 98
12 11 wordpress.org 12,511,154 98
13 12 docs.google.com 3,642,278 98
14 13 mozilla.org 2,593,193 98
15 14 maps.google.com 6,190,949 98
16 15 youtu.be 5,434,247 98
17 16 drive.google.com 2,681,591 97
18 17 bp.blogspot.com 18,327,022 97
19 18 sites.google.com 2,401,535 97
20 19 googleusercontent.com 3,994,187 97
21 20 accounts.google.com 2,557,208 97
22 21 t.me 1,826,000 97
23 22 europa.eu 2,437,683 97
24 23 plus.google.com 10,955,614 97
25 24 whatsapp.com 4,778,976 97
26 25 adobe.com 2,880,183 96
27 26 facebook.com 61,926,417 96
28 27 policies.google.com 3,521,103 96
29 28 uol.com.br 694,206 96
30 29 istockphoto.com 3,728,189 96
31 30 vimeo.com 3,628,948 96
32 31 vk.com 1,869,205 96
33 32 github.com 3,170,446 96
34 33 amazon.com 5,149,651 96
35 34 search.google.com 1,825,467 95
36 35 bbc.co.uk 1,750,633 95
37 36 google.de 1,083,507 95
38 37 live.com 1,022,973 95
39 38 gravatar.com 12,679,255 95
40 39 nih.gov 1,591,787 95
41 40 dan.com 4,340,736 95
42 41 files.wordpress.com 7,667,702 95
43 42 www.yahoo.com 1,307,493 95
44 43 cnn.com 1,672,093 95
45 44 dropbox.com 1,124,594 95
46 45 wikimedia.org 2,113,156 95
47 46 creativecommons.org 1,780,143 95
48 47 google.com.br 298,643 95
49 48 line.me 1,120,656 95
50 49 googleblog.com 4,497,927 95
51 50 opera.com 1,037,979 95
52 51 es.wikipedia.org 995,228 95
53 52 globo.com 468,129 95
54 53 brandbucket.com 11,171,565 95
55 54 myspace.com 1,364,994 95
56 55 slideshare.net 1,002,121 95
57 56 paypal.com 1,188,172 95
58 57 tiktok.com 1,499,229 95
59 58 netvibes.com 1,238,045 95
60 59 theguardian.com 1,607,812 95
61 60 who.int 2,039,611 95
62 61 goo.gl 5,175,255 95
63 62 medium.com 1,869,221 95
64 63 tools.google.com 1,854,489 95
65 64 draft.blogger.com 12,332,795 95
66 65 pt.wikipedia.org 426,145 95
67 66 fr.wikipedia.org 659,228 95
68 67 www.weebly.com 6,870,155 95
69 68 news.google.com 870,057 95
70 69 developers.google.com 1,170,712 95
71 70 w3.org 1,145,140 95
72 71 mail.google.com 691,883 95
73 72 gstatic.com 642,344 95
74 73 jimdofree.com 1,700,543 95
75 74 cpanel.net 2,172,574 95
76 75 imdb.com 1,561,359 95
77 76 wa.me 2,064,272 95
78 77 feedburner.com 1,792,625 95
79 78 enable-javascript.com 5,140,325 95
80 79 nytimes.com 2,218,148 95
81 80 workspace.google.com 774,353 95
82 81 ok.ru 378,557 95
83 82 google.es 480,733 95
84 83 dailymotion.com 1,132,411 95
85 84 afternic.com 2,480,391 94
86 85 bloomberg.com 887,696 94
87 86 amazon.de 569,270 94
88 87 photos.google.com 278,989 94
89 88 wiley.com 704,290 94
90 89 aliexpress.com 544,478 94
91 90 indiatimes.com 515,121 94
92 91 youronlinechoices.com 592,485 94
93 92 elpais.com 445,804 94
94 93 tinyurl.com 1,475,080 94
95 94 yadi.sk 158,812 94
96 95 spotify.com 1,828,736 94
97 96 huffpost.com 1,193,239 94
98 97 ru.wikipedia.org 378,068 94
99 98 google.fr 417,006 94
100 99 webmd.com 851,236 94
101 100 samsung.com 427,587 94
102 101 independent.co.uk 784,346 94
103 102 amazon.co.jp 884,321 94
104 103 get.google.com 626,795 94
105 104 amazon.co.uk 806,792 94
106 105 4shared.com 574,070 94
107 106 telegram.me 445,468 94
108 107 planalto.gov.br 126,833 94
109 108 businessinsider.com 878,711 94
110 109 ig.com.br 159,348 94
111 110 issuu.com 1,033,499 94
112 111 www.gov.br 187,184 94
113 112 wsj.com 999,835 94
114 113 hugedomains.com 16,362,198 94
115 114 picasaweb.google.com 619,826 94
116 115 usatoday.com 896,757 94
117 116 scribd.com 774,108 94
118 117 www.gov.uk 652,298 94
119 118 storage.googleapis.com 1,116,110 94
120 119 huffingtonpost.com 1,066,209 94
121 120 bbc.com 939,892 94
122 121 estadao.com.br 138,770 94
123 122 nature.com 690,832 94
124 123 mediafire.com 897,442 94
125 124 washingtonpost.com 1,194,795 94
126 125 forms.gle 966,378 94
127 126 namecheap.com 1,072,723 94
128 127 forbes.com 1,489,798 94
129 128 mirror.co.uk 428,559 94
130 129 soundcloud.com 1,918,274 94
131 130 fb.com 486,554 94
132 131 marketingplatform.google.... 917,237 94
133 132 domainmarket.com 943,784 94
134 133 ytimg.com 1,070,336 94
135 134 terra.com.br 200,774 94
136 135 google.co.uk 590,081 94
137 136 shutterstock.com 563,596 94
138 137 dailymail.co.uk 1,133,056 94
139 138 reg.ru 540,012 94
140 139 t.co 2,196,874 94
141 140 cdc.gov 962,959 94
142 141 thesun.co.uk 430,774 94
143 142 wp.com 2,247,022 94
144 143 cnet.com 761,368 94
145 144 instagram.com 30,964,179 94
146 145 researchgate.net 797,083 94
147 146 google.it 443,661 94
148 147 fandom.com 659,960 94
149 148 office.com 756,429 94
150 149 list-manage.com 795,381 94
151 150 msn.com 1,086,862 94
152 151 un.org 649,321 94
153 152 de.wikipedia.org 682,828 94
154 153 ovh.com 678,504 94
155 154 mail.ru 481,079 94
156 155 bing.com 1,075,800 94
157 156 news.yahoo.com 720,761 94
158 157 myaccount.google.com 378,003 94
159 158 hatena.ne.jp 1,641,114 94
160 159 shopify.com 3,485,537 94
161 160 adssettings.google.com 482,210 94
162 161 bit.ly 5,047,889 94
163 162 reuters.com 971,280 94
164 163 booking.com 431,062 94
165 164 discord.com 507,193 94
166 165 buydomains.com 1,240,200 94
167 166 nasa.gov 707,621 94
168 167 aboutads.info 723,570 94
169 168 time.com 858,096 94
170 169 abril.com.br 279,992 94
171 170 change.org 509,706 94
172 171 nginx.org 1,177,240 94
173 172 twitter.com 61,414,860 94
174 173 www.wikipedia.org 553,224 94
175 174 archive.org 1,389,450 94
176 175 cbsnews.com 705,330 94
177 176 networkadvertising.org 707,285 94
178 177 telegraph.co.uk 1,014,460 94
179 178 pinterest.com 9,106,097 94
180 179 google.co.jp 648,233 94
181 180 pixabay.com 510,897 94
182 181 zendesk.com 542,253 93
183 182 cpanel.com 970,971 93
184 183 vistaprint.com 815,228 93
185 184 sky.com 251,300 93
186 185 windows.net 460,289 93
187 186 alicdn.com 572,952 93
188 187 google.ca 339,139 93
189 188 lemonde.fr 287,813 93
190 189 newyorker.com 453,295 93
191 190 webnode.page 516,598 93
192 191 surveymonkey.com 459,483 93
193 192 translate.google.com 297,180 93
194 193 calendar.google.com 227,821 93
195 194 amazonaws.com 419,795 93
196 195 academia.edu 426,259 93
197 196 apache.org 1,094,485 93
198 197 imageshack.us 707,362 93
199 198 akamaihd.net 599,670 93
200 199 nginx.com 1,090,331 93
201 200 discord.gg 467,826 93
202 201 thetimes.co.uk 435,853 93
203 202 search.yahoo.com 525,853 93
204 203 amazon.fr 269,771 93
205 204 yelp.com 987,170 93
206 205 berkeley.edu 510,730 93
207 206 google.ru 198,345 93
208 207 sedoparking.com 726,100 93
209 208 cbc.ca 512,969 93
210 209 unesco.org 372,647 93
211 210 ggpht.com 790,656 93
212 211 privacyshield.gov 426,999 93
213 212 www.over-blog.com 811,888 93
214 213 clarin.com 150,420 93
215 214 www.wix.com 2,809,873 93
216 215 whitehouse.gov 423,904 93
217 216 icann.org 646,968 93
218 217 gnu.org 597,621 93
219 218 yandex.ru 1,067,333 93
220 219 francetvinfo.fr 184,067 93
221 220 gmail.com 216,193 93
222 221 mozilla.com 208,426 93
223 222 ziddu.com 192,713 93
224 223 guardian.co.uk 597,395 93
225 224 twitch.tv 553,259 93
226 225 sedo.com 2,498,739 93
227 226 foxnews.com 600,891 93
228 227 rambler.ru 934,373 93
229 228 books.google.com 453,152 93
230 229 stanford.edu 672,405 93
231 230 wikihow.com 664,665 93
232 231 it.wikipedia.org 354,035 93
233 232 20minutos.es 183,667 93
234 233 sfgate.com 387,976 93
235 234 liveinternet.ru 358,214 93
236 235 ja.wikipedia.org 377,585 93
237 236 000webhost.com 243,969 93
238 237 espn.com 410,145 93
239 238 eventbrite.com 685,661 93
240 239 disney.com 269,415 93
241 240 statista.com 428,155 93
242 241 addthis.com 637,495 93
243 242 pinterest.fr 122,907 93
244 243 lavanguardia.com 173,195 93
245 244 vkontakte.ru 336,278 93
246 245 doubleclick.net 496,992 93
247 246 bp2.blogger.com 561,940 93
248 247 skype.com 437,720 93
249 248 sciencedaily.com 380,140 93
250 249 bloglovin.com 547,875 93
251 250 insider.com 407,525 93
252 251 pl.wikipedia.org 150,769 93
253 252 sputniknews.com 185,708 93
254 253 id.wikipedia.org 567,598 93
255 254 doi.org 619,455 93
256 255 nypost.com 457,559 93
257 256 elmundo.es 248,073 93
258 257 abcnews.go.com 596,975 93
259 258 ipv4.google.com 325,798 93
260 259 deezer.com 177,906 93
261 260 express.co.uk 393,591 93
262 261 detik.com 407,103 93
263 262 mystrikingly.com 424,979 93
264 263 rakuten.co.jp 672,132 93
265 264 amzn.to 1,050,344 93
266 265 arxiv.org 293,699 93
267 266 alibaba.com 403,447 93
268 267 fb.me 354,320 93
269 268 wikia.com 505,599 93
270 269 t-online.de 263,484 93
271 270 telegra.ph 336,651 93
272 271 mega.nz 211,685 93
273 272 usnews.com 487,873 93
274 273 plos.org 342,614 93
275 274 naver.com 551,101 93
276 275 ibm.com 551,601 93
277 276 smh.com.au 339,840 93
278 277 dw.com 414,201 93
279 278 google.nl 276,116 93
280 279 lefigaro.fr 234,085 93
281 280 bp1.blogger.com 561,364 93
282 281 picasa.google.com 257,556 93
283 282 theatlantic.com 598,135 93
284 283 nydailynews.com 390,352 93
285 284 themeforest.net 545,669 93
286 285 rtve.es 200,359 93
287 286 newsweek.com 423,875 93
288 287 ovh.net 543,402 93
289 288 ca.gov 581,381 93
290 289 goodreads.com 954,326 93
291 290 economist.com 404,606 93
292 291 target.com 317,574 93
293 292 marca.com 126,447 93
294 293 kickstarter.com 504,234 93
295 294 hindustantimes.com 262,717 93
296 295 weibo.com 1,415,948 93
297 296 finance.yahoo.com 508,841 93
298 297 huawei.com 1,938,323 93
299 298 e-monsite.com 145,440 93
300 299 hubspot.com 385,555 93
301 300 npr.org 895,466 93
302 301 netflix.com 404,990 93
303 302 gizmodo.com 410,719 93
304 303 netlify.app 445,381 93
305 304 yandex.com 1,553,717 93
306 305 mashable.com 457,794 93
307 306 cnil.fr 251,699 93
308 307 latimes.com 748,209 93
309 308 steampowered.com 318,831 93
310 309 rt.com 313,229 93
311 310 photobucket.com 1,625,154 93
312 311 quora.com 495,634 93
313 312 nbcnews.com 722,611 93
314 313 android.com 323,946 93
315 314 instructables.com 364,549 93
316 315 www.canalblog.com 410,731 93
317 316 www.livejournal.com 3,021,508 93
318 317 ouest-france.fr 145,514 93
319 318 tripadvisor.com 784,126 93
320 319 ovhcloud.com 683,341 93
321 320 pexels.com 397,595 93
322 321 oracle.com 646,434 93
323 322 yahoo.co.jp 670,052 93
324 323 addtoany.com 794,722 93
325 324 sakura.ne.jp 413,375 93
326 325 cointernet.com.co 693,382 93
327 326 twimg.com 825,776 93
328 327 britannica.com 588,729 93
329 328 php.net 689,106 93
330 329 standard.co.uk 274,064 93
331 330 groups.google.com 419,267 93
332 331 cnbc.com 613,798 93
333 332 loc.gov 492,836 93
334 333 qq.com 4,380,702 93
335 334 buzzfeed.com 637,933 93
336 335 godaddy.com 2,529,806 93
337 336 ikea.com 384,319 93
338 337 disqus.com 928,440 93
339 338 taringa.net 151,257 93
340 339 ea.com 237,847 93
341 340 dropcatch.com 875,677 93
342 341 techcrunch.com 543,668 93
343 342 canva.com 362,582 93
344 343 offset.com 1,988,241 93
345 344 ebay.com 961,717 93
346 345 zoom.us 704,595 93
347 346 cambridge.org 397,605 93
348 347 unsplash.com 687,293 93
349 348 playstation.com 273,692 93
350 349 people.com 288,670 93
351 350 springer.com 573,622 93
352 351 psychologytoday.com 418,913 93
353 352 sendspace.com 151,427 93
354 353 home.pl 217,375 93
355 354 rapidshare.com 293,608 93
356 355 prezi.com 344,865 93
357 356 photos1.blogger.com 780,018 93
358 357 thenai.org 462,289 93
359 358 ftc.gov 314,810 93
360 359 google.pl 190,864 93
361 360 ted.com 657,104 93
362 361 secureserver.net 872,578 93
363 362 code.google.com 317,681 93
364 363 plesk.com 718,603 93
365 364 aol.com 670,116 93
366 365 biglobe.ne.jp 270,506 93
367 366 hp.com 504,078 93
368 367 canada.ca 330,831 93
369 368 linktr.ee 605,356 93
370 369 hollywoodreporter.com 319,018 93
371 370 ietf.org 414,873 93
372 371 clickbank.net 469,022 93
373 372 harvard.edu 826,745 93
374 373 amazon.es 204,999 93
375 374 oup.com 460,417 93
376 375 timeweb.ru 664,889 93
377 376 engadget.com 448,101 93
378 377 vice.com 427,956 93
379 378 cornell.edu 496,567 93
380 379 dreamstime.com 454,338 93
381 380 tmz.com 243,067 93
382 381 gofundme.com 371,903 93
383 382 pbs.org 564,342 93
384 383 stackoverflow.com 413,584 93
385 384 abc.net.au 429,595 93
386 385 sciencedirect.com 754,026 93
387 386 ft.com 528,414 93
388 387 variety.com 333,021 93
389 388 alexa.com 307,499 93
390 389 abc.es 213,186 93
391 390 walmart.com 390,872 93
392 391 gooyaabitemplates.com 600,604 93
393 392 redbull.com 175,031 93
394 393 ssl-images-amazon.com 587,621 93
395 394 theverge.com 439,537 93
396 395 spiegel.de 374,585 93
397 396 about.com 732,285 93
398 397 nationalgeographic.com 653,291 93
399 398 bandcamp.com 648,189 93
400 399 m.wikipedia.org 353,593 93
401 400 zippyshare.com 189,307 93
402 401 wired.com 713,599 93
403 402 freepik.com 386,749 93
404 403 outlook.com 427,062 93
405 404 mit.edu 757,903 93
406 405 sapo.pt 240,677 93
407 406 goo.ne.jp 332,616 92
408 407 java.com 151,581 92
409 408 google.co.th 120,992 92
410 409 scmp.com 204,983 92
411 410 mayoclinic.org 464,667 92
412 411 scholastic.com 200,881 92
413 412 nba.com 253,654 92
414 413 reverbnation.com 227,313 92
415 414 depositfiles.com 143,553 92
416 415 video.google.com 213,021 92
417 416 howstuffworks.com 324,205 92
418 417 cbslocal.com 310,532 92
419 418 merriam-webster.com 348,548 92
420 419 focus.de 167,487 92
421 420 admin.ch 232,963 92
422 421 gfycat.com 161,812 92
423 422 com.com 246,021 92
424 423 narod.ru 272,108 92
425 424 boston.com 327,894 92
426 425 sony.com 176,593 92
427 426 justjared.com 124,409 92
428 427 bitly.com 371,241 92
429 428 jstor.org 275,031 92
430 429 amebaownd.com 192,731 92
431 430 g.co 188,465 92
432 431 gsmarena.com 137,657 92
433 432 lexpress.fr 123,852 92
434 433 reddit.com 7,039,676 92
435 434 usgs.gov 255,999 92
436 435 bigcommerce.com 355,749 92
437 436 gettyimages.com 404,317 92
438 437 ign.com 331,808 92
439 438 justgiving.com 162,197 92
440 439 techradar.com 212,371 92
441 440 weather.com 233,771 92
442 441 amazon.ca 256,117 92
443 442 justice.gov 233,706 92
444 443 sciencemag.org 317,941 92
445 444 pcmag.com 317,511 92
446 445 theconversation.com 373,863 92
447 446 foursquare.com 274,397 92
448 447 flickr.com 9,156,536 92
449 448 giphy.com 471,107 92
450 449 tvtropes.org 143,379 92
451 450 fifa.com 201,899 92
452 451 upenn.edu 339,397 92
453 452 digg.com 816,971 92
454 453 bestfreecams.club 394,385 92
455 454 histats.com 452,459 92
456 455 salesforce.com 256,815 92
457 456 blog.google 157,787 92
458 457 apnews.com 331,978 92
459 458 theglobeandmail.com 275,289 92
460 459 m.me 268,002 92
461 460 europapress.es 120,539 92
462 461 washington.edu 390,985 92
463 462 thefreedictionary.com 269,796 92
464 463 jhu.edu 263,019 92
465 464 euronews.com 220,805 92
466 465 liberation.fr 127,411 92
467 466 ads.google.com 167,381 92
468 467 trustpilot.com 528,385 92
469 468 google.com.tw 148,361 92
470 469 softonic.com 163,143 92
471 470 kakao.com 173,573 92
472 471 storage.canalblog.com 320,126 92
473 472 interia.pl 163,015 92
474 473 metro.co.uk 286,308 92
475 474 viglink.com 398,156 92
476 475 last.fm 444,132 92
477 476 blackberry.com 145,035 92
478 477 public-api.wordpress.com 188,788 92
479 478 sina.com.cn 993,710 92
480 479 unicef.org 222,033 92
481 480 archives.gov 286,115 92
482 481 nps.gov 392,668 92
483 482 utexas.edu 291,645 92
484 483 biblegateway.com 289,507 92
485 484 usda.gov 446,386 92
486 485 indiegogo.com 276,002 92
487 486 nikkei.com 262,714 92
488 487 radiofrance.fr 137,232 92
489 488 repubblica.it 216,052 92
490 489 substack.com 272,562 92
491 490 ap.org 195,696 92
492 491 nicovideo.jp 165,700 92
493 492 joomla.org 224,141 92
494 493 news.com.au 300,915 92
495 494 allaboutcookies.org 477,121 92
496 495 mailchimp.com 421,675 92
497 496 stores.jp 440,302 92
498 497 intel.com 268,542 92
499 498 bp0.blogger.com 561,170 92
500 499 box.com 288,327 92
501 500 nhk.or.jp 256,113 92

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,42 @@
package fr.lengrand.scrape
import fr.lengrand.opengraphkt.Parser
import java.net.URI
import java.nio.file.Files
import java.nio.file.Paths
/**
 * Walks every entry under `./scrape-test/data/web`, runs one OpenGraph parse per entry and
 * prints how many attempts completed, produced valid data, or threw.
 *
 * NOTE(review): each iteration parses the same hard-coded IMDB URL instead of the entry being
 * visited, so the counters currently measure repeated remote fetches rather than the scraped
 * files. Presumably the visited path should be handed to the parser — confirm which
 * `Parser.parse` overload is intended before changing it.
 */
fun main() {
    val parser = Parser()
    var total = 0
    var success = 0
    var error = 0
    var valid = 0
    val websiteFolder = "./scrape-test/data/web"
    val path = Paths.get(websiteFolder)

    // Files.walk keeps directory handles open until the returned stream is closed,
    // so the traversal is scoped with `use` to release them even if the walk fails.
    Files.walk(path).use { entries ->
        entries.forEach {
            println("filename: $it")
            total++
            try {
                val openGraphData = parser.parse(URI("https://www.imdb.com/title/tt0068646/").toURL())
                success++
                if (openGraphData.isValid()) {
                    valid++
                }
            } catch (e: Exception) {
                // Best-effort run: report the failure and keep going with the next entry.
                println("Error parsing URL: ${e.message}")
                error++
            }
        }
    }

    println("Total: $total")
    println("Success: $success")
    println("Valid: $valid")
    println("Error: $error")
}

View File

@@ -0,0 +1,146 @@
package fr.lengrand.scrape
import io.ktor.client.HttpClient
import io.ktor.client.engine.cio.CIO
import io.ktor.client.plugins.BrowserUserAgent
import io.ktor.client.plugins.HttpTimeout
import io.ktor.client.request.get
import io.ktor.client.statement.HttpResponse
import io.ktor.client.statement.bodyAsText
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.joinAll
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import java.io.File
import java.util.concurrent.atomic.AtomicInteger
/**
 * One row of the website classification CSV, as split into its four comma-separated fields.
 *
 * @property id numeric identifier taken from the first field
 * @property url address of the website taken from the second field
 * @property description free-text field taken from the third field
 * @property category label taken from the fourth field
 */
data class Website(
    val id: Int,
    val url: String,
    val description: String,
    val category: String,
)
// Both paths are relative to the repository root: the process is started from the
// main repo folder, not from inside the scrape-test module.

/** CSV file that `main` reads to obtain the list of websites to scrape. */
val dataFile: String = "./scrape-test/data/website_classification.csv"

/** Directory that `main` passes to the scraper as the output location for downloaded pages. */
val siteDirectory: String = "./scrape-test/data/web"
/**
 * Bulk webpage scraper that uses a Ktor CIO client to download many URLs concurrently and
 * writes each response body to its own file.
 *
 * The output directory is created when the scraper is constructed. Call [close] once the
 * scraper is no longer needed to release the underlying HTTP client.
 *
 * @param outputDirectory directory the downloaded pages are written into
 * @param concurrencyLevel parallelism limit applied to the IO dispatcher used for scraping
 * @param requestTimeoutMillis per-request timeout, in milliseconds, installed on the HTTP client
 */
@OptIn(kotlinx.coroutines.ExperimentalCoroutinesApi::class)
class WebScraper(
    private val outputDirectory: String = "scrape-test/data/web",
    private val concurrencyLevel: Int = 20,
    private val requestTimeoutMillis: Long = 30000
) {
    private val client = HttpClient(CIO) {
        install(HttpTimeout) {
            requestTimeoutMillis = this@WebScraper.requestTimeoutMillis
        }
        BrowserUserAgent()
    }

    // Progress counters; atomic because they are updated from concurrently running coroutines.
    private val completedCount = AtomicInteger(0)
    private val failedCount = AtomicInteger(0)
    private val totalCount = AtomicInteger(0)

    init {
        File(outputDirectory).mkdirs()
    }

    /**
     * Scrapes a single webpage and saves it to a file.
     *
     * Any failure other than cancellation is reported on stdout and counted as a failed URL;
     * it does not propagate to the caller.
     *
     * @param url The URL to scrape
     * @param outputFilePath The file path to save the HTML content
     */
    private suspend fun scrapeWebpage(url: String, outputFilePath: String) {
        try {
            val response: HttpResponse = client.get(url)
            val htmlContent = response.bodyAsText()
            File(outputFilePath).writeText(htmlContent)
            println("[${completedCount.incrementAndGet()}/${totalCount.get()}] Successfully scraped: $url")
        } catch (e: kotlinx.coroutines.CancellationException) {
            // Cancellation is not a scrape failure: rethrow so structured concurrency can
            // unwind instead of logging it and carrying on as if the coroutine were still active.
            throw e
        } catch (e: Exception) {
            println("[${failedCount.incrementAndGet()}/${totalCount.get()}] Failed to scrape $url: ${e.message}")
        }
    }

    /**
     * Scrapes multiple webpages in parallel and saves them to files.
     *
     * Resets the progress counters, launches one coroutine per URL and suspends until all of
     * them have finished, then prints a summary line.
     *
     * @param urls List of URLs to scrape
     */
    suspend fun scrapeWebpages(urls: List<String>) {
        totalCount.set(urls.size)
        completedCount.set(0)
        failedCount.set(0)
        println("Starting to scrape ${urls.size} URLs with concurrency level: $concurrencyLevel")

        // A view of Dispatchers.IO restricted to `concurrencyLevel` parallel workers
        // (this does not create a dedicated thread pool).
        // NOTE(review): coroutines suspended in `client.get` presumably do not occupy a slot,
        // so this may not cap the number of in-flight requests — confirm whether a semaphore
        // is wanted here.
        val dispatcher = Dispatchers.IO.limitedParallelism(concurrencyLevel)

        withContext(dispatcher) {
            urls.map { url ->
                val outputPath = "$outputDirectory/${sanitizeURls(url)}"
                // Launch a coroutine for each URL
                launch { scrapeWebpage(url, outputPath) }
            }.joinAll() // Wait for all coroutines to complete
        }

        println("Scraping completed. Total: ${urls.size}, Successful: ${completedCount.get()}, Failed: ${failedCount.get()}")
    }

    /**
     * Generates a safe filename from a URL.
     *
     * Strips a leading `http://` or `https://` and replaces every character that is not an
     * ASCII letter, digit, dot or hyphen with an underscore.
     *
     * @param url the URL to convert
     * @return the sanitized name, usable as a file name inside the output directory
     */
    fun sanitizeURls(url: String): String =
        url
            .replace(SCHEME_PREFIX, "")
            .replace(UNSAFE_FILENAME_CHARS, "_")

    /**
     * Closes the HTTP client and releases resources.
     */
    fun close() {
        client.close()
    }

    private companion object {
        // Compiled once instead of on every sanitizeURls call.
        val SCHEME_PREFIX = Regex("^https?://")
        val UNSAFE_FILENAME_CHARS = Regex("[^a-zA-Z0-9.-]")
    }
}
/**
 * Entry point of the bulk scraper: reads the website list from [dataFile], then downloads
 * every listed URL into [siteDirectory].
 *
 * The CSV is parsed naively by splitting each line on commas into exactly four leading fields
 * (id, url, description, category); the first line is treated as a header and skipped.
 */
suspend fun main() {
    // useLines closes the underlying reader once the block returns, even on failure.
    val websites = File(dataFile).useLines { lines ->
        lines
            .drop(1) // Skips headers
            .filter { it.isNotBlank() } // a blank (e.g. trailing) line would break the destructuring below
            .map { line ->
                val (id, url, description, category) = line.split(",")
                Website(
                    id.toInt(),
                    url,
                    description,
                    category
                )
            }
            .toList()
    }
    val urls = websites.map { it.url }

    val scraper = WebScraper(
        outputDirectory = siteDirectory,
        concurrencyLevel = 20,
        requestTimeoutMillis = 30000
    )
    // Always release the HTTP client, whether or not scraping completes normally.
    try {
        scraper.scrapeWebpages(urls)
    } finally {
        scraper.close()
    }
}

View File

@@ -1,6 +1,6 @@
pluginManagement {
plugins {
kotlin("jvm") version "2.1.21"
kotlin("jvm") version "2.2.20"
}
}
plugins {
@@ -9,4 +9,5 @@ plugins {
rootProject.name = "OpenGraphKt"
include("opengraphkt")
include("demo")
include("demo-remote")
include("demo-remote")
include("scrape-test")