
    :hU                         d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZ d Zd Zd Zd	 Zd
 ZdedefdZddZdde	defdZy)    N)BeautifulSoup)now)NetworkSourceErrorNetworkMonitoredPageManualDataSource)send_alert_notificationc                 @    | j                  dd      }|dk(  ry|dk(  ryy )N	ifMissingnulltrueTfalseFget)configoptions     -/var/www/extractly/manual_agregator/parser.py_resolve_missing_boolr   	   s+    ZZV,F	7	    c                 P    | j                  d      dk(  r| j                  dd      S y )Nr
   defaultdefaultValue r   )r   s    r   _resolve_missing_textr      s'    zz+)+zz."--r   c                     | sy| j                         j                         j                  dd      }ddddddddddddddd}|j                         D ]  \  }}||v s|c S  y)	zAStandardizes and detects currency code (PLN, EUR, USD, GBP, etc).N r   PLNEURUSDGBP)u   złplnzlu   zł.eureurou   €usddolar$zus$gbpfunt   £)striplowerreplaceitems)label_or_texttextCURRENCY_MAPkvs        r   parse_currencyr3      s     &&(00b9D U%eEu5e5
L ""$19H % r   c                    | sy| j                  dd      j                  dd      } t        j                  d|       }d}d}d}|ru|j                  d      j                  dd	      }	 d|v rt	        |      n
t        |      }| |j                         d j                         }t        |      xs t        |       }|||fS # t        $ r d}Y Kw xY w)
z?Extracts number, label (suffix), and currency code from a text.)NNN    r   ,.z([\d\s]+(?:\.\d+)?)N   r   )
r,   researchgroupfloatint	Exceptionendr*   r3   )r/   matchnumberlabelcurrency
number_strs         r   extract_number_and_labelrE   ,   s    <<$,,S#6DII,d3EFEH[[^++C4
	*-*;U:&ZF UYY[\"((*!%(@N4,@5(""  	F	s   &B> >CCc                     | j                  dddi      D ]R  }|j                  d      }t        |      dk\  s#||d   j                  v s5|d   j                  j                         c S  y )	Ndivzdata-sentry-elementItemGridContainer)attrsp   r   r8   )find_alllenr/   r*   )souprB   rG   pss       r   extract_value_by_labelrP   A   se    }}U+@BU*V}W\\#r7a<ERUZZ/a5::##%% X r   inactive_rulesreturnc                    |D ]  }|j                  d      }|dk(  r?|j                  dd      j                         }|| j                  j                         v sW y|dk(  rk|j                  d      }|j                  dd      j                         }|j                  |      }|s|j	                  d      j                         |k(  s y|d	k(  rl|j                  d      }|j                  dd      j                         }|j                  |      }|s||j	                  d      j                         v s8 y|d
k(  r&|j                  d      }|j                  |      rc y|dk(  sl|j                  d      }	|j                  dd      j                         }
t        | |	d      }t        |t              rt        j                  |      }|s|
t        |      j                         v s y y)Ntypetext_containsr/   r   Tselector_textselectorr*   selector_containsselector_missingsource_field_matchfieldr@   F)r   r+   html
select_oneget_textgetattr
isinstancedictjsondumpsstr)pagerN   rQ   rule	rule_typer/   rW   expected_textelementr\   r@   values               r   _check_inactiverl   H   s   HHV$	'88FB'--/Dtyy((/)xx
+H HHVR0668Mooh/G7++$+7==?=P--xx
+H HHVR0668Mooh/G=G,<,<4,<,H,N,N,PP,,xx
+H??8,..HHW%EHHWb)//1ED%,E%&

5)#e*"2"2"44; > r   c                    t               }|r| j                  |      n| g}|D ]  }|j                  d      D ]9  }|j                  d      }|s|j	                  d      s)|j                  |       ; |j                  d      D ]t  }|j                  d      }|s|j                  d      D ]J  }|j                         j                  d      d   }|s(|j	                  d      s:|j                  |       L v  t        |      S )	u   
    Zwraca listę unikalnych linków do zdjęć (src z <img>, srcset z <source>) wg selektora lub domyślnie cała strona.
    imgsrchttpsourcesrcsetr6   r   r   )	setselectrL   r   
startswithaddsplitr*   list)	rN   rW   urlscontextscopern   ro   rr   urls	            r   extract_image_linksr}   n   s     5D'/dkk(#dVG>>%(C''%.Cs~~f- )
 >>(+CWWX&F!<<,C))+++C03Cs~~f5 -	 ,  :r   rf   c                   012 | j                   }	 |j                  }	 t        d       t        d| j                          t        d| j                  d d	  d
       t        | j                  d      }i }i }d }|j                  xs g }|D ]>  }	d|	v r|j                  |	d         }
|
s|
j                  d      j                         0|	j                  dg       }t!        |t"              r|g}t%        0fd|D              sy|	d   }t        d|	d    d|         nd|	v st'        | |	d   d      2t!        2t(              rt+        j,                  2      2n2t#        2      j                         2|	j                  dg       }t!        |t"              r|g}t%        2fd|D              s&|	d   }t        d|	d    d|         n |j.                  }t!        |t(              r|r
||v r||   }n|}t        d|xs d dt1        |j3                                       |j5                         D ]  \  }}|j                  d      s|j                  dd      }d }t        d| d|        |d k(  rA|j                  d!      }t7        ||      }|j                  d"d      }|d#k(  r]t9        |      \  }}}|j                  d$      r
|r|||d$   <   |j                  d%      r
|r|||d%   <   |}t        d&| d'| d(|        n|d)k(  r|xs dj;                         j                         1|j                  d*g d+      }|j                  d,g d-      }t%        1fd.|D              rd}n"t%        1fd/|D              rd}nt=        |      }t        d0| d'|        n|}t        d1| d'|        n|d#k(  r|j                  d      }|j                  |      }
|
r|
j                  d      n
t?        |      }t9        |      \  }}}|j                  d$      r
|r|||d$   <   |j                  d%      r
|r|||d%   <   |}t        d2| d'| d(|        nY|d)k(  r|j                  d      }|j                  |      }
|
r|
j                  d      n
t?        |      }|xs dj;                         j                         1|j                  d*g d+      }|j                  d,g d-      }t%        1fd3|D              rd}n"t%        1fd4|D              rd}nt=        |      }t        d5| d'|        n~|dk(  ry|j                  d      }|j                  |      }
|
r9|j                  d6d      r|
j                  d
d7      }n|
j                  d      }nt?        |      }t        d8| d'|        |||<   |||<    |j5                         D ]4  \  }}|j                  d      r|j                  dd      }d }t        d9| d|        |d:k(  r>|j                  d      }tA        ||      }t        d;| d<tC        |       d=       |||<   |d k(  rA|j                  d!      }t7        ||      }|j                  d"d      }|d#k(  r]t9        |      \  }}}|j                  d$      r
|r|||d$   <   |j                  d%      r
|r|||d%   <   |}t        d&| d'| d(|        n|d)k(  r|xs dj;                         j                         1|j                  d*g d+      }|j                  d,g d-      }t%        1fd>|D              rd}n"t%        1fd?|D              rd}nt=        |      }t        d0| d'|        nV|}t        d1| d'|        nA|d#k(  r|j                  d      }|j                  |      }
|
r|
j                  d      n
t?        |      }t9        |      \  }}}|j                  d$      r
|r|||d$   <   |j                  d%      r
|r|||d%   <   |}t        d2| d'| d(|        n|d)k(  r|j                  d      }|j                  |      }
|
r|
j                  d      n
t?        |      }|xs dj;                         j                         1|j                  d*g d+      }|j                  d,g d-      }t%        1fd@|D              rd}n"t%        1fdA|D              rd}nt=        |      }t        d5| d'|        ndB|v r|j                  dB      }|j                  |      }t        dC| d'|        |j                  dD      xs" |j                  |i       j                  dDdE      }|j                  dF      }|rv|t|jE                  |      D cg c]  }|j;                          } }dG|cxk  rtC        |       k  rn n| |   }t        dH| dI| d'|        nd }t        dH| dI| dJ       n|r|}nt?        |      }n|dk(  r{|j                  d      }|j                  |      }
|
r9|j                  d6d      r|
j                  d
d7      }n|
j                  d      }nt?        |      }t        d8| d'|        n9|dKk(  r|j                  d      }|j                  |      }
|
r|
j                  d      j                         0t%        0fdL|j                  d*g       D              rd}n2t%        0fdM|j                  d,g       D              rd}nt=        |      }t        dN| d'|        nt=        |      }t        dN| dO| dP       ne|dQk(  r|j                  d      }|j                  |      }
|
rx|
j                  d      j                         0|j                  dRi       }!d }"|!j5                         D ]  \  }#}$t%        0fdS|$D              s|#}" n |"}t        dT| d'|        nt?        |      }t        dT| dO| dP       n|dUk(  r|j                  dVg       }%d}&|%D ]/  }'|j                  |'      }
|
s|&dW|
j                  dWd7      z   z  }&1 |j                  dXg       D ]  }(|&jG                  |(d      }& |&j;                         }&|&r|&n
t?        |      }t        dY|% d'|        |r%|#t        dZ| d[| dP       tI        d\| d]|       |||<   7 t        d^| j                   d_       |j5                         D ]  \  })}*t        d`|) da|*         t'        |dbi       xs i j5                         D ]!  \  }+},tK        | |,      st'        | |,      ||+<   # |jL                  xs g }-tO        | ||-      rd| _(        tS               | _*        tC        |      }.tW        dc |jY                         D              }/|/|.|j5                         D )*cg c]  \  })}*|)ddk7  s|*d dg i fvs|) c}*})|j5                         D )*cg c]  \  })}*|)ddk7  s|*d dg i fv s|) c}*})|xs dde|dd<   t        df|dd           t+        j,                  |dg      | _-        || _.        d| _/        | ja                          t        dh| j                   d
d        y# t        j                  $ r3 t        j
                  j                  |dd       t        d|        Y yw xY wc c}w c c}*})w c c}*})w # tb        $ r t        di| j                   d
te        jf                                 t        j
                  j                  |te        jf                         dj       ti        dk| j                   dl|jj                   dmte        jf                          dn|jj                   o       Y yw xY w)pNu)   Brak ManualDataSource dla tego źródła.MissingManualDataSource)rq   error_message
error_typez3[ERROR] ManualDataSource does not exist for source FzP================================================================================z[INFO] Parsing page: z%[INFO] First 500 characters of HTML:
i  
zhtml.parserrW   TrX   r@   c              3   B   K   | ]  }|j                         v   y wNr+   ).0valcontents     r   	<genexpr>z$parse_manual_data.<locals>.<genexpr>   s     J\c399;'1\   rT   z"[DEBUG] Rule matched by selector: z, type: rq   r   c              3   B   K   | ]  }|j                         v   y wr   r   )r   r   rk   s     r   r   z$parse_manual_data.<locals>.<genexpr>   s     D|syy{e+|r   z [DEBUG] Rule matched by source: z[INFO] Using selectors (type: r   z): isMain	fieldTyper/   z!
[FIELD] (isMain) Parsing field: z, fieldType: 	labelPairrB   	valueTyperA   currencyField
labelFieldz [DEBUG] labelPair-number: label=z, value=z, currency=booltrueOptions)takyesr   1falseOptions)nienor   0c              3   (   K   | ]	  }|k(    y wr    r   optr   s     r   r   z$parse_manual_data.<locals>.<genexpr>   s     ?YcsczY   c              3   (   K   | ]	  }|k(    y wr   r   r   s     r   r   z$parse_manual_data.<locals>.<genexpr>   s      Bzzr   z[DEBUG] labelPair-bool: label=z[DEBUG] labelPair-text: label=z[DEBUG] number: selector=c              3   (   K   | ]	  }|k(    y wr   r   r   s     r   r   z$parse_manual_data.<locals>.<genexpr>        ;#3#:r   c              3   (   K   | ]	  }|k(    y wr   r   r   s     r   r   z$parse_manual_data.<locals>.<genexpr>        >:CSCZ:r   z[DEBUG] bool: selector=
paragraphs)	separatorr*   z[DEBUG] text: selector=z
[FIELD] Parsing field: imagesz[DEBUG] images: selector=z, found=z linksc              3   (   K   | ]	  }|k(    y wr   r   r   s     r   r   z$parse_manual_data.<locals>.<genexpr>9  r   r   c              3   (   K   | ]	  }|k(    y wr   r   r   s     r   r   z$parse_manual_data.<locals>.<genexpr>;  r   r   c              3   (   K   | ]	  }|k(    y wr   r   r   s     r   r   z$parse_manual_data.<locals>.<genexpr>W  s     7YcsczYr   c              3   (   K   | ]	  }|k(    y wr   r   r   s     r   r   z$parse_manual_data.<locals>.<genexpr>Y  s     :zzr   fromMainz[DEBUG] fromMain: base_field=splitByr6   
splitIndexr   z![DEBUG] fromMain split: split_by=z, split_index=z, value=None (out of range)boolKeyc              3   B   K   | ]  }|j                         v   y wr   r   r   r   r   s     r   r   z$parse_manual_data.<locals>.<genexpr>  s     [=Zc399;'1=Zr   c              3   B   K   | ]  }|j                         v   y wr   r   r   s     r   r   z$parse_manual_data.<locals>.<genexpr>  s     ^?]SYY[G3?]r   z[DEBUG] boolKey: selector=z , value=None (missing, fallback=)keykeyMapc              3   B   K   | ]  }|j                         v   y wr   r   )r   variantr   s     r   r   z$parse_manual_data.<locals>.<genexpr>  s     Rgw}}'9r   z[DEBUG] key: selector=many	selectorsr   cleanOptionsz[DEBUG] many: selectors=z%[WARNING] Value not found for field: z
 (config: zValue not found: z -> z
[INFO] Extracted fields for :z    z: 
trasferredc              3   2   K   | ]  }|d dg i fvsd  y w)Nr   r8   r   )r   r   s     r   r   z$parse_manual_data.<locals>.<genexpr>  s&     \(:c$PRTVXZI[>[1(:s   _stats)found_fieldstotal_fields
found_keysmissing_keys	type_usedz[INFO] Stats: )ensure_asciiz*[SUCCESS] Data parsed and saved for page: z&[EXCEPTION] Error while parsing page: ManualParsingErroru   Błąd parsowania ogłoszenia z (z)

u   ❌ Błąd parsowania: )subject)6rq   manual_data_source_fetcherr   DoesNotExistr   objectscreateprintr|   r]   r   rulesr^   r_   r+   r   ra   re   anyr`   rb   rc   rd   r   rx   keysr-   rP   rE   r*   r   r   r}   rM   rw   r,   
ValueErrorhasattrinactiverl   	is_activer   inactive_datesumvaluesraw_data
parse_datais_completesaver>   	traceback
format_excr   title)3rf   strictrq   manual_configrN   	extractedmain_valuesselected_typer   rg   rj   match_valuesall_selectorsr   
field_namer   
field_type
text_valuerB   raw
value_typerA   	label_valrC   	true_opts
false_optsrW   
base_field
base_valuesplit_bysplit_indexrJ   partskey_mapmatchedfinal_valuevariantsselectors_listcombinedsel
clean_itemr1   r2   r   source_fieldrQ   r   r   r   r   rk   s3                                                   @@@r   parse_manual_datar      s+   [[F	*0*K*KKf%dhhZ016tyy#6GrJKTYY6	 ##)rDT!//$z*:;%..T.:@@BG#'88GR#8L!,4(4~J\JJ(,V B4
CSBTT\]j\klmT!d8nb9eT* JJu-E]E
((*#xx4lC0$0>LD|DD$(LM<T(^<LHUbTcde3 6 &//mT*}R_A_%m4I%I.}/I	.J#dS\SaSaScNdMefg #,//"3Jzz(##ZZV<
!
::,mT^S_`a ,"JJw/E0u=C!'K!@J!X-6Ns6S3	8!::o68AIIf_&=>!::l3	>GIf\&:;%+
 @xPZ|[fgofpqr#v-"yb//1779$*JJ}>Y$Z	%+ZZ@[%\
?Y??)-J  Bz BB).J)>v)FJ >ugXj\Z[%(
 >ugXj\Z[8+%zz*5H"ooh7G:A'***6G\]cGdC2J32O/FIxzz/2x=E	&"9:zz,/I:C	&"67!'J5hZx
|S^_g^hij6)%zz*5H"ooh7G:A'***6G\]cGdC9"++-335C &

=:U VI!'N<W!XJ;;;%)
>:>>%*
%:6%B
3H:Xj\RS6)%zz*5H"ooh7G!::lE:)0)9)9DPT)9)UJ)0)9)9)9)EJ%:6%B
3H:Xj\RS +5J'(2	*%Y #4^ #,//"3Jzz(#
  K8JJ-j\zlSTX%!::j10x@
1(8C
OCTTZ[\(2	*%[(

7+,T59#ZZV<
)2J32O/FIxzz/2x=E	&"9:zz,/I:C	&"67!'J<UG8J<Wbckblmn6)9"++-335C &

=:U VI!'N<W!XJ;;;%)
>:>>%*
%:6%B
:5'*VW!$J:5'*VWx'!::j1//(36=g&&T&2CXY_C`.Fs.K+	8::o.89AIf_56::l+	6?If\23#
1(8J<{[cZdefv%!::j1//(36=g&&T&2CXY_C`yb'')//1"JJ}6QR	#ZZ8ST
7Y77!%J:z::!&J!6v!>J/z*NOv%#ZZ
3
(__Z8
5j\*VW!::i0eIMM*b4Q4U4UV_ad4e$jj6+"90:0@0@0JK0J1QWWY0JEKK4#e*4%*;%7
 A(>ZeYffnoynz{|%)
 A(>ZeYf  gB  C  D!+J!6v!>Jv%!::j1//(3zz,6%,%5%5D%5%Q
%,%5%5D%5%A
!6v!>J/z*NO y(!::j1//(3%..T.:@@BG[VZZWY=Z[[%)
^vzz.Z\?]^^%*
%:6%B
6xjUV!6v!>J6xj@`ak`llmnou$!::j1//(3%..T.:@@BG$jj26G"G18-XRRR&1G! 2A ")J28*HZLQR!6v!>J28*<\]g\hhijkv%!'K!<)C"ooc2G C'*:*:SPT*:*U$UU * #)**^R"@J'//
B?H #A#>>+)1X7LV7T
00@UV*,=j\TZS[[\]^ #4ZLVH!MNN$.Ij!u #4x 	.txxj:;OO%DAqD2aS/" & #*-r"J"Pb!W!W!YCt\*!(|!<	# "Z
 '//524~6"DN!$D 9~\	(8(8(:\\(()2):l):Aa8mPQZ^`bdfhjYkPk1):l+4??+<j+<41aXRSX\^`bdfhWiRiQ+<j&3)
	( 	y2345 

95A#		:488*BvhOPO
 (( ""))E0 	* 	

 	CF8LMn LD mj  6txxj9CWCWCYBZ[\""))#..0+ 	* 	

 	 ,TXXJbeIL`L`LbKcd-fll^<	
 s   { B| *A| | $B| 1B| ]!| -|H8| =A(| &D| =B| |
|
|
| 2| |	|A4| A||| B3r   )T)rc   r   r9   bs4r   django.utils.timezoner   extractly.modelsr   r   r   manual_agregator.notificationsr   r   r   r3   rE   rP   rx   r   rl   r}   r   r   r   r   <module>r      sf      	  % W W B
,#*     L8Y0 Y$ Yr   