
    wiG,                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZmZ d dlmZmZ d dlZd dlmZ d dlmZ d dlmZ  ej.                  e      Zh dZd	d
iZdZ ej:                  ddd      ZdZ e j@                  de jB                        Z"de#de$fdZ%de#de$fdZ&de#de#fdZ'de#de#fdZ(de#dee#   fdZ)de#de#dee#   fdZ*de
de#dee#   fdZ+de#de#fdZ,de#dee-e#f   fd Z.d!d"d#e#ddfd$Z/dee#   fd%Z0y)&    N)Iterable)AnyOptionalTuple)urljoinurlparse)transaction)timezone)upload_to_ovh>   	image/gif	image/png
image/jpeg
image/webpz	image/jpgr   i  @g      4@g      @)connectread)zno-photono_photoplaceholderz/og_image_main.jpgz/static-gh/brak_zdjeciazlogo.jpgz/gfx/logotypy/zM(?P<url>https?://[^\s"\'<>]+?\.(?:jpg|jpeg|png|webp|gif))(?:[?#][^\s"\'<>]*)?ureturnc                 \    | xs dj                         t        fdt        D              S )N c              3   &   K   | ]  }|v  
 y w)N ).0phlus     ,/var/www/extractly/image_agregator/images.py	<genexpr>z"_is_placeholder.<locals>.<genexpr>#   s     9"8BrRx"8s   )loweranyPLACEHOLDER_SUBSTRINGS)r   r   s    @r   _is_placeholderr#   !   s%    
'r	B9"8999    sc                     | xs dj                         } | j                  d      xr | j                  d      xs$ | j                  d      xr | j                  d      S )Nr   []{})strip
startswithendswith)r%   s    r   _looks_like_jsonr.   %   sJ    	
bALL1!**S/]q||C7H7\QZZX[_]r$   c                    | j                         j                  d      r'| j                         j                  d      sd| dd  z   S | j                         j                  d      r'| j                         j                  d      sd| dd  z   S | S )Nzhttps:/https://   zhttp:/http://   )r    r,   r   s    r   _fix_scheme_slashesr5   )   sw    wwyI&qwwy/C/CJ/OAabE!!wwyH%aggi.B.B9.M1QR5  Hr$   c                 d    | xs dj                         j                  d      j                         S )Nr   z'")r+   r4   s    r   _strip_quotesr7   1   s'    G??""5)//11r$   c                 \    t         j                  | xs d      }|r|j                  d      S y)uv   
    Jeśli w stringu jest „page URL” + cytowany URL obrazka,
    wyciągnij pierwszy dopasowany URL obrazka.
    r   urlN)
IMG_URL_REsearchgroup)r%   ms     r   _extract_img_url_from_textr>   5   s,    
 	!'r"Awwu~r$   rawbase_urlc                 :   | syt        |       }t        |      }|xs |}t        |      }|j                         }|j	                  d      s|j	                  d      rt        |      rdS |S |j	                  d      rt        |xs d|      }t        |      rdS |S y)u  
    Zwraca poprawny absolutny URL obrazka dla „dziwnych” stringów:
    - usuwa cudzysłowy,
    - naprawia https:/ -> https://,
    - wycina URL obrazka z tekstu, jeśli są śmieci dookoła,
    - stosuje urljoin TYLKO dla ścieżek względnych.
    Nr2   r0   /r   )r7   r>   r5   r    r,   r#   r   )r?   r@   r%   	extractedlowabsus         r   _normalize_candidate_strrF   ?   s     cA*1-IQA 	AA '')C
~~i CNN:$>&q)t0q0 	||Cx~2q)&t,t6$6 r$   c                 V   | syt        | t              rC| j                         }|syt        |      r	 t	        j
                  |      } nt        ||      }|S t        | t              r6dD ]1  }| j                  |      }|st        t        |      |      }|s/|c S  t        | t              rt        | t        t        t        f      s| D ]  }t        |t              rV|j                         rFt        |      r)	 t	        j
                  |      }t        ||      }|r|c S 	 t        ||      }|r|c S t        |t              szdD ]3  }|j                  |      }|st        t        |      |      }|s/|c c S   y# t        $ r t        ||      }|cY S w xY w# t        $ r Y w xY w)u"  
    Zwraca absolutny URL pierwszego sensownego obrazka.
    Obsługuje:
      - str (również „json w stringu”, np. '["https://..."]' albo '{"url":"..."}'),
      - dict: klucze url/src/href,
      - list/iterable: stringi lub dicty z ww. kluczami.
    Placeholdery są pomijane.
    N)r9   srchref)
isinstancestrr+   r.   jsonloads	ExceptionrF   dictgetCollIterablebytes_first_image_url)r?   r@   r%   candkvitemjs           r   rS   rS   _   s     #sIIKAjjm ,Ax8DK #t'A
A/AAK ( #|$Zc5$=O-PD$$#D) JJt,/8<#'K  
 0h?K$%/AA7AI#'K 0% 2 Y  /8<: % s#   E? 9$F?FF	F('F(ctc                     t         j                  | xs dj                         j                         | xs dj                         j                               } | dk(  ry| dk(  ry| dk(  ry| dk(  ry	t	        j
                  |       xs d
}|dv rdS |S )Nr   r   .jpgr   .pngr   .webpr   .gifz.bin)z.jpe)CONTENT_TYPE_FIXrP   r+   r    	mimetypesguess_extension)rY   exts     r   _ext_from_content_typerc      s    			rxR..066828:J:J:L:R:R:T	UB	\	[	\	[

#
#B
'
16CI%6.3.r$   r9   c                 N   t        |       }|j                  r(|j                  r|j                   d|j                   dnd }ddd}|r||d<   t        j                  t
        d|      5 }|j                  d	|       5 }|j                          |j                  j                  d
d      xs dj                  d      d   j                         j                         }t        j                  ||      }|t        vr|j                  xs dj                         }|j!                  d      rd}nN|j!                  d      rd}n:|j!                  d      rd}n&|j!                  d      rd}nt#        d|xs d       t%        j&                         }|j)                         D ]@  }	|	s|j+                  |	       |j-                         t.        kD  s/t#        dt.         d       |j1                         |fcd d d        cd d d        S # 1 sw Y   nxY w	 d d d        y # 1 sw Y   y xY w)Nz://rB   z_Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36zFimage/avif,image/webp,image/apng,image/svg+xml,image/*;q=0.8,*/*;q=0.5)z
User-AgentAcceptRefererT)timeoutfollow_redirectsheadersGETzcontent-typer   ;r   )r[   z.jpegr   r\   r   r]   r   r^   r   zUnsupported content-type: unknownzImage too large > z bytes)r   schemenetlochttpxClientHTTP_TIMEOUTstreamraise_for_statusri   rP   splitr+   r    r_   ALLOWED_MIMEpathr-   
ValueErrorioBytesIO
iter_byteswritetell	MAX_BYTESgetvalue)
r9   parsedrefererri   clientresprY   rv   bufchunks
             r   _safe_downloadr      s   c]F7=}}s6==/3\`GwZG $		lT7	SW]]]5#&$!!#,,"">26<"CCCHKQQSYY[B!%%b"-B%)r002==!23%B]]6*$B]]7+%B]]6*$B$'A"/	AR%STT**,C*		% 88:	)$'9)F%KLL + <<>2%7 '& 
T	S&&& 
T	S	Ss+   *H=EH%H2	HH	
HH$r   noter   c          	      f   	 d| _         t        | d      rd| _        t        | d      rbt        | d      xs i }|j	                  dg        |r7|d   j                  t        j                         j                         |d       || _	        dg}t        | d      r|j                  d       t        | d      r|j                  d       | j                  |       t        j                  dt        | d	d
      t        | dd
             y
# t        $ r }t        j                  d|       Y d
}~y
d
}~ww xY w)u  
    Oznacz ogłoszenie do sprawdzenia stanu (status-only):
      - check_active=True  -> trafi do get_flagged_pages(...)
      - check_active_from_image=True -> wiemy, że powodem była awaria obrazka
    Nic nie zmieniamy w is_active tutaj – zajmie się tym checker.
    Tcheck_active_from_imagemetaimage_failures)tsr   check_activeupdate_fieldsz5Flagged for status-check (image failure) id=%s url=%sidNr9   z<Failed to flag instance for status-check (image failure): %s)r   hasattrr   getattr
setdefaultappendr
   now	isoformatr   saveloggerinforN   warning)instancer   r   r   es        r   _flag_for_status_checkr      s.   Z $867/3H, 8V$8V,2DOO,b1%&--#<<>335tD !HM ((867  !:;8V$  (M2KWU]_ceiMjlst|  D  FJ  mK  	L ZUWXYYZs   DD 	D0D++D0c           
         t        | dd      xs d}t        | dd      }t        ||      }|s/t        j                  dt        | dd             t	        | d	       y	 t        |      \  }}t        j                  |      j                         dd }t        |      }| j                  j                  j                         }	t        | dd      }
t!        j"                         j%                  d      }d|	 d|
 d| d| | 	}	 t'        |||d      }|s0t        j)                  d|t        | dd             t	        | d	       yt+        j,                         5  || _        | j1                  dg       ddd       t        j                  dt        | dd      |       |S # t        $ rP}t        j                  d
|t        | dd      |       t	        | dt        |      j                   	       Y d}~yd}~ww xY w# t        $ rR}t        j)                  d|t        | dd      |d       t	        | dt        |      j                   	       Y d}~yd}~ww xY w# 1 sw Y   xY w)uf  
    Pobiera pierwszy obrazek z instance.original_image_urls, wysyła do OVH
    i zapisuje **string** (publiczny URL) w polu `images`.

    Jeśli nie uda się znaleźć/pobrać/wysłać zdjęcia:
      - oznaczamy ogłoszenie do sprawdzenia: check_active=True, check_active_from_image=True
      - NIE zmieniamy od razu is_active – zrobi to checker.
    r9   r   original_image_urlsN)r@   z&No image candidates for instance id=%sr   no_image_candidatesr   z(Download image failed for %s (id=%s): %szdownload_error:   rl   z%Y/%m/%dzimages/rB   z#public, max-age=31536000, immutable)content_typecache_controlz+Upload to OVH failed for key=%s (id=%s): %sT)exc_infozupload_error:z0OVH upload returned empty URL for key=%s (id=%s)upload_empty_urlimagesr   z!Stored main image for id=%s -> %s)r   rS   r   r   r   r   rN   r   type__name__hashlibsha256	hexdigestrc   	__class__r    r
   r   strftimer   errorr	   atomicr   r   )r   r@   	originals	first_urldatarY   r   sharb   clsobj_idymd
object_key
public_urls                 r   store_main_imager      sG    x+1rH"7>I X>I<ghPTVZ>[\x.CD!),b ..

(
(
*3B
/C
 
$C



%
%
+
+
-CXtY/F
,,.
!
!*
-C3%q#auSE:J
"?	

 GU\]egkmqUrsx.@A 
			$XJ/ 
 KK3WXtT5RT^_G  A9gV^`dfjNkmnoxQ@P@P?Q.RS&  BJPWX`bfhlPmop{  	Axd1g>N>N=O.PQ 
	s>   F G: I	G7'AG22G7:	IAIII!)1rerL   rx   r   r`   loggingcollections.abcr   rQ   typingr   r   r   urllib.parser   r   ro   	django.dbr	   django.utilsr
   cloud_storage.servicesr   	getLoggerr   r   ru   r_   r}   Timeoutrq   r"   compile
IGNORECASEr:   rK   boolr#   r.   r5   r7   r>   rF   rS   rc   rR   r   r   r   r   r$   r   <module>r      s   
  	    4 ' ' *  ! ! 0			8	$ F. 	u}}T3T:  RZZTMM

:s :t :^ ^ ^3 3 2S 2S 2# (3- #  # @@# @ @# @F/s /s /&& &&eSj 1 &&X 57 !Zc !Z4 !ZH6(3- 6r$   