
    wi^                        d dl Z d dlZd dlZd dlZd dlmZmZmZ d dlm	Z	 d dl
mZ d dlmZmZ d dl
mZ d dlmZmZmZmZ d dlmZ d dlmZ d d	lmZ  ej4                  e      Zd
edefdZd Z de!de!fdZ"dedee   de!de!de!f
dZ#	 	 	 	 	 	 	 	 	 	 	 	 	 	 d$de$dee$   de!de!de!dee$   deee      de!de!deee      de!de!d ee!   de$fd!Z%d"efd#Z&y)%    N)OptionalSetIterable)urlparse)Q)transactionmodels)NetworkMonitoredPage	AdsManualSourceManualManualParserLog)map_data_to_manual_model)parse_manual_data)NLPEnhancedExtractorsreturnc                 .   | si S 	 t        j                  |       S # t         j                  $ rf t        j                  dd|       }	 t        j                  |xs d      cY S # t         j                  $ r t
        j                  d       i cY cY S w xY ww xY w)Nz[\x00-\x1F\x7F] {}z)Failed to parse JSON even after cleaning.)jsonloadsJSONDecodeErrorresubloggerwarning)r   cleaneds     1/var/www/extractly/manual_agregator/run_parser.pysafe_json_loadsr      s    	zz!} &&+R3	::go..## 	NNFGI		s-    *BA!B!*BBBBc                    | si S | }t        | t              r	 t        j                  |       }t        |t
        t        f      ri S t        |t              rZt        j                  j                  D ch c]  }|j                   }}|j                         D ci c]  \  }}||v s|| c}}S i S # t        $ r i cY S w xY wc c}w c c}}w )u/  
    Zwraca dict do filtrowania querysetu stron.
    Akceptuje:
      - None -> {}
      - str(JSON) -> jeśli dict, zwróć go; jeśli list -> zignoruj (to layout rules)
      - dict -> przefiltruj po polach modelu i zwróć
      - list/tuple -> zignoruj (to layout rules)
      - inne typy -> {}
    )
isinstancestrr   r   	Exceptionlisttupledictr
   _metafieldsnameitems)	raw_rulesparsedfmodel_fieldskvs         r   _normalize_rules_for_querysetr1   )   s     	F)S!	ZZ	*F &4-(	&$(<(B(B(I(IJ(I1(IJ!'EA13D1EEI  	I	 KEs#   B1 1CC&C1B?>B?include_zeroc                     t        | t              r| j                         dk(  S | yt        | t        t        f      r|r| dk(  S y)Nr   Tr   F)r!   r"   stripintfloat)valr2   s     r   _is_empty_valuer8   L   sA    #syy{b  
{#U|$ax    mappedr(   require_allc                     g }|D ].  }|}|j                  t        | j                  |      |             0 |rt        |      S t	        |      S N)appendr8   getallany)r:   r(   r;   r2   checksr-   keys          r   _matches_empty_policyrD   X   sH    Fofjjo|DE  &3v;63v;6r9   limitonly_iddry_runenableforce	manual_idforce_namesonly_activeonly_inactiveonly_empty_fieldsrequire_all_emptyempty_include_zerouse_description_scraperc                 :  2345 d}|r;t        t        j                  j                  |d            }|sEt	        d| d      t        t        j                  j                  d            }|st	        d      t        d       |D ]?  }t        d	|j                   d
t        |dt        |             d|j                   d       A t        d       |r|D ch c]  }|j                          c}nd}t        |d      D ]v  \  }}t        t        |dd      xs d      j                         }t        t        |j                  dd      xs d      j                         }t        t        |j                  dd      xs d      j                         }|j                         |j                         |j                         h4t        |      xs |duxr t        4fd|D              }t        d| dt!        |       d|j                   d
|xs | d|j                   d| dt        |       dt        |       d       t#        t        |dd            }|rt$        j                  j                  |j                        j'                  t)        d      t)        d      z  t)        d      z  t)        d      z  t)        d      z  t)        d       z  t)        d!      z  t)        d"!      z  t)        d!      z  t)        d!      z  t)        d!      z        j+                  d#      }nt-        dZd|j                  d$|}t%        j                  j                  dZi |j'                  t)        d      t)        d      z  t)        d      z  t)        d      z  t)        d      z  t)        d       z  t)        d!      z  t)        d!      z  t)        d!      z  t)        d!      z        j+                  d#      }|r|j                  d%      }n|	r|j                  d&%      }|
rm|rkd't        d(t(        ffd)}|
D cg c]
  } ||       }}|rt)               }|D ]  } ||r| n| z  } nt)               }|D ]  } || z  }	 |j                  |      }|r|j                  |*      }|r|j                  |+      }|j/                         }!t        d,|j                   d
t        |d|       d-|! d.|  d	       |d|  }"d}#d5d[d/t        d0t        d1t        d(t        fd22d/t        d(t        f2fd33d4t0        d/t        d(df35fd5}$t        |"d      D ]  \  }%}&	 t        d6|% dt3        |!|        d7|&j4                          t7        |&d&8      }'|'st8        j;                  d9|&j4                          t<        j                  j?                  |&|j                  |j@                  tC        |&j4                        jD                  |&j4                  d"d:d;d&d&<
       |&jF                  xs tI        |&jJ                  xs d      }(|(st8        j;                  d=|&j4                          t<        j                  j?                  |&|j                  |j@                  tC        |&j4                        jD                  |&j4                  d>d?d@dd&<
       wtM        |(|&      })|
rwtO        |)|
|      sit<        j                  j?                  |&|j                  |j@                  tC        |&j4                        jD                  |&j4                  dAdB|
 dCdd&<
       |r't8        jQ                  dD|&j4                   dE|
        %tS        jT                         5  |rA|&jV                  r5	 |&jX                  j[                          d|&_,        |&j_                  dFgG       t1        j                  j>                  dZi |)}*|*|&_,        |&j_                  dFgG       ddd        |$*|       |#dz  }#t8        jQ                  dH|&j4                           t        dT|j                   dU|# dt3        |!|        dV       ||#z  }y t        dW| dXt!        |       dY       |S c c}w c c}w # t\        $ r Y w xY w# 1 sw Y   xY w# t\        $ r}+ta        t0        dItc               v r)ni       },te        jf                         }-|,rdJji                  dK |,D              }.t8        jk                  dL|&j4                   dM|+ dN|.        t<        j                  j?                  |&|j                  |j@                  tC        |&j4                        jD                  |&j4                  dOt        |+       dP|- t        |+      ddQ  |,D /01cg c]  \  }/}0}1|/|0|1dR nc c}1}0}/w c}1}0}/dd&S       nt8        jk                  dL|&j4                   dM|+        t<        j                  j?                  |&|j                  |j@                  tC        |&j4                        jD                  |&j4                  d"t        |+       dP|- t        |+      ddQ  dItc               v d&<
       Y d}+~+9d}+~+ww xY w)\u;  
    ... (docstring skrócony) ...
    only_empty_fields: iter pól z {'price','title','description','address'} — filtruj do pustych.
    require_all_empty: jeśli True, wszystkie podane pola muszą być puste (inaczej: wystarczy któreś).
    empty_include_zero: jeśli True, 0 uznajemy za puste dla price.
    r   T)idrH   z!No ManualDataSource found for id=.)rH   z&No active ManualDataSource to process.z
Manuals to process:z - [z] r)   z (source_id=)r   N   titlec              3   &   K   | ]  }|v  
 y wr=    ).0ncandidates_to_matchs     r   	<genexpr>z'process_manual_queue.<locals>.<genexpr>   s     0dSca6I1IScs   z
>> START [/z] Manual: [z) | FORCED=z	 (global=z
; by-name=rules)source)sliced_html__isnull)sliced_html__exactr   z[] )html__isnull)html__exacterrorrS   )network_ad_manual__isnullr`   )	is_activeFfieldr   c                     | j                         j                         }|dk(  rt        d      }r|t        d      z  }|S |dv r#t        di d| ddit        di d| d	d
iz  S t        g       S )NpriceT) network_ad_manual__price__isnullr   )network_ad_manual__price)rW   descriptionaddressnetwork_ad_manual____isnull__exactr   )pk__inrY   )lowerr4   r   )ri   r-   qrP   s      r   q_empty_forz)process_manual_queue.<locals>.q_empty_for   s    KKM'')<4@A)Q::H==I"5aS A4HIALwSfghfiipQqsuPvLwwwB<'r9   )rS   )name__icontainsz	Manual: [u    – total to process: z listings (limit: manualrC   defaultc                     	 t        | dd       xs i }t        |t              rt        j                  |      }|j                  d      xs i }|j                  ||      }t        |      S # t        $ r |cY S w xY w)N	selectorssettings)getattrr!   r"   r   r   r?   boolr#   )rx   rC   ry   r{   r|   r7   s         r   _get_setting_boolz/process_manual_queue.<locals>._get_setting_bool   ss    #FK>D"	i- $

9 5I$==4:ll30Cy  s   A'A* *A87A8c                 2    t              S  | dd      S )NrQ   F)r~   )rx   r   rQ   s    r   _should_use_description_scraperz=process_manual_queue.<locals>._should_use_description_scraper	  s%    &2344$V-FNNr9   adc                 Z   	  |      sy t        | dd       }|sy 
t               j                  |      }|sy i }g }| j                  j	                         D ci c]  }t        |d      s|j                  | }}d }d }	|j                         D ]R  \  }
}|j                  |
      }|st        | |
d       } |||      s1 |	||      }|=|||
<   |j                  |
       T |r|j                         D ]  \  }}t        | ||        |r:t        | ddj                  |             t        |j                               dgz   }nt        |j                               }| j                  |       y y c c}w # t        $ r"}t         j#                  d|        Y d }~y d }~ww xY w)	Nrn   attnamec                     ddl m} t        ||j                  |j                  f      r+| d u xs% t        | t
              xr | j                         dk(  S | d u S )Nr   r	   r   )	django.dbr	   r!   	CharField	TextFieldr"   r4   )r7   ri   djms      r   _is_empty_fieldzYprocess_manual_queue.<locals>._merge_from_description_if_enabled.<locals>._is_empty_field%  sM    7!%#--)GH"d{Zz#s/C/Y		WYHYZ$;&r9   c                 L   ddl m} ddlm}m} ddlm}m} t        ||j                  |j                  f      ry| y t        |       }	 t        |dd       }|j                  d      sJd}	t        |t              r3|r1t        |      dz   |kD  r|dk\  r|dkD  r|d |dz
   nd|	z   }|S ||	z   }|S ||	z   }|S t        ||j                         r?| d	v r| S t        | t              r(| j#                         j%                         }
|
d
v ry|
dv ryy t        ||j&                        r&t        ||j                         s	 | d S t        |       S t        ||j(                  |j*                  f      rU| y t        | |      r| S 	 t        |       j-                  dd      j-                  dd      j-                  dd      } ||      S t        ||j4                        rr| y t        | ||f      rt        | |      r| S | j                         S t        |       j#                         }dD ]$  }	 |j7                  ||      j                         c S  y | S # t        $ r d }Y w xY w# t        $ r Y y w xY w# |t.        t0        f$ r 	 t3        |       cY S # t        $ r Y Y y w xY ww xY w# t        $ r Y w xY w)Nr   r   )DecimalInvalidOperation)datetimedate
max_length*rV   r   )TF)true1takyesT)false0nienoF    rc   ,rT   )z%Y-%m-%dz%d.%m.%Yz%d/%m/%Yz%d-%m-%Y)r   r	   decimalr   r   r   r   r!   r   r   r"   r}   r#   endswithr5   lenBooleanFieldr4   rt   IntegerFieldDecimalField
FloatFieldreplace
ValueError	TypeErrorr6   	DateFieldstrptime)valueri   r   r   r   r   r   r   max_lenmarkerr0   fmts               r   _coerce_value_for_fieldzaprocess_manual_queue.<locals>._merge_from_description_if_enabled.<locals>._coerce_value_for_field,  s   7A7!%#--)GH =#'J+&-e\4&HG  !zz#%(F)'37G#&q6A:#7'.!|AH1Q}1-=RTX^,^
  ! )*F
A  ! %&J !%)9)9: M1#(L%eS1 % 3 3 5A $??'+ $??',#!%)9)9::eUXUeUeCf(+0=4Hc%jH "%#*:*:CNN)KL =#'%eW5#(L, #E
 2 263 ? G GR P X XY\^a bA#*1:- "%7 =#'%eh-=>,6ud,C5UUJ,,.#SC)'/'8'8C'@'E'E'G G $T
  $ Ly  ) +&*G+:  ) (#'( !1*iH ,,',U| 3#, ,'+,,  $- ) ()sm   I 4I 8
I 7AI% JII	I"!I"%J6
J J	JJJJ	J#"J#description_scraped_variablesr   update_fieldsz-Description enrichment skipped due to error: )r}   r   extract_allr'   
get_fieldshasattrr)   r*   r?   r>   setattrjoinr$   keyssaver#   r   r   )r   rx   descresultsupdate_payloadupdated_by_descriptionr-   r.   r   r   rC   raw_valri   current_valcoercedr/   r0   update_fields_final_er   	extractors                      r   "_merge_from_description_if_enabledz@process_manual_queue.<locals>._merge_from_description_if_enabled  s   ~U6v>r=$7$ 4 6I#//5 "$)+& 46883F3F3Hb3HaGTUW`La	3Hb'H!T %,MMOLC(,,S1E  ")"c4"8K*;> 5guEG *1N3'*11#6 %4 " . 4 4 61Aq) !7 .$CSXXNdEef.2>3F3F3H.IMlLm.m+.2>3F3F3H.I+GG*=G> "A  cT  U!NrdSTTUs?   E? E? E?  E? E:.E:=C;E? :E? ?	F*F%%F*u     → [z] Processing listing: )strictzParser returned False for z1parse_manual_data returned False - parsing failedzParsing failed)
network_pager`   source_namedomainurl	skip_typeerror_messageerror_summaryparsed_successfullysaved_to_ads_manualz#No data in raw_data/parse_data for no_dataz,No data in raw_data/parse_data after parsingzEmpty parsed dataintentionalz Skipped by empty fields policy: zFiltered by empty fields policyz[DRY RUN] Ready to save: z | filtered by empty=network_ad_manualr   zSaved listing: r:   z | c              3   :   K   | ]  \  }}}| d | d| d  yw): z	 (value='z')NrY   )rZ   fldreasonpreviews       r   r]   z'process_manual_queue.<locals>.<genexpr>  s0     (slrThTWY_ahC56()G9B)Olrs   z
Error for r   z | fields -> 
validationz

i  )ri   r   r   )r   r`   r   r   r   r   r   r   field_errorsr   r   z
<< END of Manual: [z] processed z
 listings.z
SUMMARY: processed z listings for z manual(s).
rY   )F)6r$   r   objectsfilterr   printrS   r}   r"   	source_idrt   	enumerater4   r`   r~   rA   r   r1   r
   excluder   order_byr&   countr   minr   r   r   r   r   creater)   r   netloc
parse_datar   raw_datar   rD   infor   atomicnetwork_ad_manual_idr   deleter#   r   _guess_offending_fieldslocals	traceback
format_excr   rf   )6rE   name_filtersrF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   processed_totalmanualsmr[   force_names_normidx
manual_objmanual_namesource_titler   forced_hereqs_rulespages_qsfilter_argsrv   r-   qs_partsq_totalparttotal_to_processpagesprocessed_countr   ipageokparsed_datar:   r   eissueserror_tracedetailsr   r   r   r   r   r\   r   s6               ``                                    @@@@r   process_manual_queuer  `   st
   0 O|++22i2MN@1MNN|++22$2?@EFF	
!"QTTF"WQA78Q[[MQRST 	"I -8K(KqK(T  %Wa0Z'*fb9?R@FFH7:#4#4grBHbIOOQ'*"3"3VR@FBGMMO  
 5k 
D(dS0dSc0d-d 	 	3%qWk*--(j)j6J6J5K L!])DK=
4HXCYBZZ[]	
 1Wd1ST$,,33:;L;L3M$/,-./ ./ -	.
 4() "%& '*+ $'( $'	( #&
' $ $  *.!(( K %,,33BkB$/,-./ ./ -	.
 4() "%& $'( $'( #&	' $ $ 6H7H 
(3 
(1 
( 1BB0A1A0AHB #$DwtD8G % #$DtOG %w/H'2H|DH#>>+
bVZ)P(Q R!!1 22DUG1N	

 %  			l 		 		t 		X\ 			OL 	OT 	O@	U9 @	Ul @	UW[ @	UD !*GAtrs!C(8%$@#AAWX\X`X`Wabc&tE:NN%?z#JK#++22%))00$.OO'188 HH")&Y&6,1,1 3  "ooWAVRV1W"NN%H
#ST#++22%))00$.OO'188 HH"+&T&9,0,1 3  1+tD %09JL]_qr'//66)-#-#4#4(2#+DHH#5#<#< $&3,LM^L_*`*K0405 7  !KK";DHH:EZ[lZm no '')"t'@'@! 2299; 26.		1D0E	F"**11;F;B-/D*II-@,AIB * 32zB1$odhhZ89] +j 	%jmm_L@QQRSVWginSoRppz{|?*u 1x 
!/!2.Wm
\]A 	)n Cb  ) ! !	 *)&  #0hRXRZFZF`bc'224#jj(slr(ssGLL:dhhZr!M'!ST#++22%))00$.OO'188 HH".),Q[M&B),Qw}%~w}_s_bdjlsvRY&Zw}%~%~,0,1 3  LL:dhhZr!!=>#++22%))00$.OO'188 HH")),Q[M&B),Q,4,@,1 3 1#s   bb $B=c "B/c Bc 'c c b4%b%?Ab48c %	b1	.b40b1	1b44b=9c  
j
C1j;g"Cjjpayloadc           	          g }| j                   j                         D ]m  }t        |dd      }|r||vr||   }	 t        |t        j
                  t        j                  f      rgt        |dd      }||rt        |      }t        |      |kD  r|j                  |dt        |       d| d|dd t        |      dkD  rdnd	z   f       nt        |t        j                        r&|d
vr|j                  |d| dt        |      f       nt        |t        j                        r+t        |t        j                        s|dvrG	 t        |       n:t        |t        j                  t        j                  f      r|dvr	 t!        |       p |S # t        $ r$ |j                  |d| dt        |      f       Y 1w xY w# t        $ r$ |j                  |d| dt        |      f       Y aw xY w# t        $ r Y w xY w)u   
    Zwraca listę (field, reason, preview) dla oczywistych problemów:
    - Char/URL: długość > max_length
    - Boolean: wartość inna niż True/False/None
    - Integer/Float: nie da się zrzutować
    r   Nr   z
too long (r^   rU   x   u   …r   )TFNzinvalid boolean '')Nr   zinvalid integer 'zinvalid float/decimal ')r'   r   r}   r!   r	   r   URLFieldr"   r   r>   r   r   r5   r#   r   r   r6   )	model_clsr  r  r-   r)   r7   r   r   s           r   r   r     s    F__'')q)T*t7*dm 	!f..@A!!\48?wCA1v'tz#a&7)1-MqQURUwcfghcilocoZ_uwOx&yz Av22311MM4+<SE)CSX"NO Av223Jq&J]J]<^j(TC
 A 1 163F3FGHj(Zc
C *R M % Tt/@Q-GS&RST % Zt/Fse1-MsSVx&XYZ  		sT   D	G0F/G0 G *F=:G0<F==G0 *G-*G0,G--G00	G=<G=)ip  NNFTFNNFFNFFN)'r   loggingr   r   typingr   r   r   urllib.parser   django.db.modelsr   r   r   r	   extractly.modelsr
   r   r   r   manual_agregator.parser.utilsr   manual_agregator.parserr   8manual_agregator.description_extrator.core.nlp_extractorr   	getLogger__name__r   r"   r&   r   r1   r~   r8   rD   r5   r  r   rY   r9   r   <module>r     s     	  * * !  *   C 5 Y			8	$s t F
t 
 
7$ 7 7D 7`d 7im 7 !#&*15#$.2!jj c]j 	j
 j j }j #c(#j j j  .j j j  &d^!j" 	#jz1 1r9   