Code Coverage

| Element           | Lines             | Functions & Methods | CRAP  | Classes & Traits |
|-------------------|-------------------|---------------------|-------|------------------|
| Total             | 94.92% (56 / 59)  | 50.00% (2 / 4)      | —     | 0.00% (0 / 1)    |
| WebScraperService | 94.92% (56 / 59)  | 50.00% (2 / 4)      | 19.05 | 0.00% (0 / 1)    |
| scrapeCached      | 100.00% (8 / 8)   | 100.00% (1 / 1)     | 4     |                  |
| normalizeUrl      | 88.89% (8 / 9)    | 0.00% (0 / 1)       | 5.03  |                  |
| scrape            | 88.89% (16 / 18)  | 0.00% (0 / 1)       | 2.01  |                  |
| extractText       | 100.00% (24 / 24) | 100.00% (1 / 1)     | 8     |                  |
| 1 | <?php |
| 2 | |
| 3 | namespace App\Http\Services; |
| 4 | |
| 5 | use GuzzleHttp\Client; |
| 6 | use Illuminate\Support\Facades\Cache; |
| 7 | use Illuminate\Support\Facades\Log; |
| 8 | use Symfony\Component\DomCrawler\Crawler; |
| 9 | |
/**
 * Service for scraping website content and extracting meaningful text.
 *
 * Used by the roleplay auto-populate feature to extract company/product
 * information from a given URL. Returns empty string on failure so the
 * caller can fall back to AI grounding.
 */
class WebScraperService
{
    /**
     * Maximum number of bytes to return from scraped content.
     */
    private const MAX_CONTENT_LENGTH = 15000;

    /**
     * TTL (in seconds) for cached scrape results. 24 hours.
     */
    private const CACHE_TTL_SECONDS = 86400;

    /**
     * Cache key prefix for scrape results.
     */
    private const CACHE_KEY_PREFIX = 'roleplay:website_scrape:';

    /**
     * Scrape a website URL with a 24-hour Redis cache.
     *
     * Wraps {@see self::scrape()}: on cache hit, returns the previously
     * scraped content; on miss, scrapes and stores. Cache key is the
     * sha1 of the normalized URL (lowercased host, no fragment, no
     * trailing slash). Failed scrapes (empty result) are NOT cached so
     * the next call may retry.
     *
     * @param string $url The URL to scrape
     * @return string Scraped text content (max 15,000 chars), or empty string on failure
     */
    public function scrapeCached(string $url): string
    {
        $key = self::CACHE_KEY_PREFIX.sha1($this->normalizeUrl($url));

        $cached = Cache::get($key);
        if (is_string($cached) && $cached !== '') {
            return $cached;
        }

        $content = $this->scrape($url);

        // Only cache successful scrapes so a transient failure can be retried
        // on the next call instead of pinning an empty result for 24 hours.
        if ($content !== '') {
            Cache::put($key, $content, self::CACHE_TTL_SECONDS);
        }

        return $content;
    }

    /**
     * Normalize a URL for cache-key purposes.
     *
     * Lowercases the host, drops the fragment, and trims a trailing slash
     * from the path. Query string is preserved (a different query yields
     * a different cache entry).
     *
     * @param string $url Raw URL as supplied by the caller
     * @return string Canonical form used as the cache-key input
     */
    private function normalizeUrl(string $url): string
    {
        $parts = parse_url(trim($url));
        // parse_url() returns false on seriously malformed input; a missing
        // host means we cannot canonicalize, so fall back to a plain
        // lowercased string to keep the cache key deterministic.
        if ($parts === false || empty($parts['host'])) {
            return strtolower(trim($url));
        }

        $scheme = strtolower($parts['scheme'] ?? 'https');
        $host = strtolower($parts['host']);
        $port = isset($parts['port']) ? ':'.$parts['port'] : '';
        $path = rtrim($parts['path'] ?? '', '/');
        $query = isset($parts['query']) ? '?'.$parts['query'] : '';

        // Fragment is intentionally discarded: it never reaches the server,
        // so URLs differing only by fragment share one cache entry.
        return $scheme.'://'.$host.$port.$path.$query;
    }

    /**
     * Scrape a website URL and extract meaningful text content.
     *
     * Fetches the HTML, removes non-content elements (scripts, styles, nav, footer),
     * and extracts text from headings, paragraphs, and list items.
     *
     * Returns an empty string if the URL cannot be fetched or yields no content,
     * allowing the caller to fall back to AI grounding.
     *
     * @param string $url The URL to scrape
     * @return string The extracted text content (truncated to 15,000 chars), or empty string on failure
     */
    public function scrape(string $url): string
    {
        try {
            $client = new Client([
                'timeout' => 15,
                'connect_timeout' => 10,
                'headers' => [
                    'User-Agent' => 'Mozilla/5.0 (compatible; FlyMSG/1.0)',
                    'Accept' => 'text/html,application/xhtml+xml',
                ],
                'verify' => true,
            ]);

            $response = $client->request('GET', $url);
            $html = $response->getBody()->getContents();

            return $this->extractText($html);
        } catch (\Exception $e) {
            // Any failure (DNS, TLS, timeout, 4xx/5xx, parse) degrades to an
            // empty string so the caller can fall back to AI grounding.
            Log::warning('[WebScraperService] Failed to scrape URL', [
                'url' => $url,
                'error' => $e->getMessage(),
            ]);

            return '';
        }
    }

    /**
     * Extract meaningful text from raw HTML content.
     *
     * Strips non-content elements, then collects the meta description, the
     * page title, and the text of headings/paragraphs/list items/table
     * cells/blockquotes, joined by newlines.
     *
     * @param string $html Raw HTML content
     * @return string Cleaned text content, truncated to MAX_CONTENT_LENGTH bytes
     */
    private function extractText(string $html): string
    {
        $crawler = new Crawler($html);

        // Remove non-content elements before harvesting text.
        $crawler->filter('script, style, nav, footer, header, iframe, noscript, svg')->each(function ($node) {
            $domNode = $node->getNode(0);
            if ($domNode && $domNode->parentNode) {
                $domNode->parentNode->removeChild($domNode);
            }
        });

        $texts = [];

        // Meta description
        $metaDesc = $crawler->filter('meta[name="description"]');
        if ($metaDesc->count() > 0) {
            $content = $metaDesc->attr('content');
            if ($content) {
                $texts[] = $content;
            }
        }

        // Page title
        $title = $crawler->filter('title');
        if ($title->count() > 0) {
            $texts[] = $title->text('');
        }

        // Headings and body content; skip fragments of 10 bytes or fewer
        // (navigation crumbs, stray punctuation, single words).
        $crawler->filter('h1, h2, h3, p, li, td, blockquote')->each(function ($node) use (&$texts) {
            $text = trim($node->text(''));
            if (strlen($text) > 10) {
                $texts[] = $text;
            }
        });

        $cleanedText = implode("\n", array_filter($texts));

        if (strlen($cleanedText) > self::MAX_CONTENT_LENGTH) {
            // mb_strcut truncates on a UTF-8 character boundary at or below
            // the byte limit, so we never return a split (invalid) multibyte
            // sequence the way a bare substr() could.
            $cleanedText = mb_strcut($cleanedText, 0, self::MAX_CONTENT_LENGTH, 'UTF-8');
        }

        return $cleanedText;
    }
}