
    Ih                         d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlmZ dZd Z	 	 	 	 	 ddZ y)    N)datetime)timezone)sync_to_async)urlparse)NetworkMonitoredPage
SourceHtml)logger)upload_html_json_to_ovh)handle_cookies)run_actions_on_page)process_selectorsraw_data_cleaner)get_pages_to_process)is_inactive)check_and_normalize_htmlFc                 r   t        | t        j                        rt        |       S t        | t              r| j                         S t        | t              r-| j                         D ci c]  \  }}|t        |       c}}S t        | t              r| D cg c]  }t        |       c}S | S c c}}w c c}w )N)

isinstanceuuidUUIDstrr   	isoformatdictitemsto_serializablelist)valkvxs       1/var/www/extractly/html_agregator/html_fetcher.pyr   r      s    #tyy!3x#x }}#t25))+>+$!Q?1%%+>>#t,/0Cq"C00J ?0s   )B.B4c                 J  K   |3t        |||       d {   } t        t              |       d {   }t        j                  dt        |              d\  }}|D ]  }	t        r't        d       t        j                  d       t        r't        j                  d|	j                          	 d }
|	j                  rJ t        t        j                  j                  |	j                        j                                d {   }
|
st        j                   d       t#        |	j                        }|j$                   d	|j&                   d
}||k7  s|sB| j)                  |d       d {    t+        | t-        |	di       xs i        d {    |d}}| j)                  |	j                  d       d {   }|r|j.                  nd }| j                  }g }|rYt1        |d      rM	 t1        |j2                  d      r|j2                  j4                  ng }|D cg c]  }|j                   c}|gz   }|
j8                  rt;        | |
j8                         d {    | j=                          d {   }t?        ||      \  }}}||	_         |rd|	_!        |	jD                  xs i |	_"        d|	jD                  d<   ||	jD                  d<   d|	_#        tI        jJ                         |	_&         t        |	jN                                d {    t        j                   d|        |
jP                  r tS        |	| |
jP                  |       d {    tI        jJ                         |	_&        |
jT                  xs g }tW        |||||      \  }}| |	_,        |r=tI        jJ                         |	_-        |	jD                  xs i |	_"        ||	jD                  d<   t]        |	jB                  xs |      }|d   |	_/        |d   |	_0        |d   |	_1        d|	_#         t        |	jN                                d {    te        |	jf                        |	j                  |	j@                  |d   |d   |d   ti        |	jD                        |	jL                  jk                         |	jl                  |	jn                  |	j                  |	jX                  |	jD                  r|	jD                  jq                  d      nd d}|	j                   d
|	jf                   d}ts        ||	jf                  |        y 7 7 7 -7 7 7 c c}w # t6        $ r |g}Y w xY w7 7 7 O7 7 ,# t6        $ r0}t        j                   d|	j                   d|        Y d }~;d }~ww xY ww)N)enablename
source_idsu)   [HTML_FETCH] Rekordów do przetworzenia: )NFu$   PAUZA – naciśnij 'p' aby wznowić   z[HTML_FETCH]  )	source_idu%   [HTML_FETCH] brak configu – pomijamz:///i$ )timeoutmetaTrequestredirect_chain)http_statuserror
html_errorhtml_error_reasonz"[HTML_FETCH] HTML error detected: )htmlinactive_configurlr,   	redirectsinactive_reasonraw_textimage_links
parse_data)idr2   r0   r5   r6   r7   r)   date_fetchedestate_type
offer_typesource	is_activer4   z.jsonz[HTML_FETCH] X z: ):r   r   r   r	   infolenpausedprinttimesleepr2   r&   r   objectsfilterfirstwarningr   schemenetlocgotor   getattrstatushasattrr*   r+   	Exceptionactionsr   contentr   r0   sliced_htmlr)   
is_fetchedr   nowr9   save	selectorsr   inactiveINACTIVEr=   inactive_dater   raw_datar6   r7   r   r8   r   r   r:   r;   getr
   )pagefilter_kwargspagesr"   r#   r$   qsbase_url
cookies_okobjhtml_cfgparsednew_baseresponser,   	final_urlr3   chainreqr0   is_err
err_reason	norm_htmlinactive_cfgr   reasoncleaned	json_data	file_nameexcs                                 r    fetch_and_save_html_for_pagesrr      s     }'
 
 *mD)"--
KK;CJ<HI&Hj89JJqM  	nSWWI./l	H}}"&&--"%-- . e" " 
 FG cgg&F --FMM?!<H8#:ii%i888$T73+C+IrJJJ'/* "YYswwY>>H-5(//4KIIGHi8, #8#3#35EF !((77 
 5: :ESE :i[ HI )$0@0@AAA'D -ETWb,c)FJ	 CH")88>r)-&0:,-!%#+<<> -mCHH-///!CJ<PQ !!'T83E3EtLLL'||~C $,,2L"* ,'##K !,OCM$,LLN!88>r.4*+ 's'>$?G":.CL%m4CO$\2CN!CN)-)+++ #&&kww#J/&}5%l3'1 # 0 0 : : <"!nn-- ]]FIhh388<<0A#BTXI  ==/366(%8I#IsvvyA_ 
 ." 9J ? !;  ,!*I, B' 0 M4 ,,  	NN_SWWIRu=>	s?  V#T.V#T1AV#"V#4AU'T4U'#V#$AU'2T73"U'T:(U'>T=?0U'02U"U 5U<%U'!U"U'9U:BU'UU',V#.'U'U!C"U'8U$9C1U'*V#1V#4U'7U':U'=U' UUU'UU'U'U'!U'$U''	V 0%VV#V  V#)NNtrueNN)!rB   r   r   django.utilsr   asgiref.syncr   urllib.parser   extractly.modelsr   r   link_agregator.utils.loggerr	   cloud_storage.servicesr
   link_agregator.utils.cookiesr   html_agregator.utils.actionsr   html_agregator.utils.selectorsr   r   html_agregator.utils.processr   $link_agregator.check_active.inactiver   rW    html_agregator.utils.error_guardr   r@   r   rr        r    <module>r      sS       ! & ! = . : 7 < N = H E		 
	Er   