import sys, asyncio, json, re, time, os
sys.path.insert(0,'/opt/data/scripts')
import cbm_cdp
# JSON log read by load_done() to find posts that already have a verified comment.
LOG='/opt/data/agent-state/memory/ig-encourage-agent-log.json'
# Profile handles whose pages main() visits to collect post links.
ACCOUNTS=['proverbs31ministries','lysaterkeurst','youversion','ourdailybread','bibleproject','desiringgod','craiggroeschel','life.church','shereadstruth','wellwateredwomen','christianitytoday']
# Promotional / call-to-action language. Short tokens are anchored with \b so
# they no longer fire inside unrelated words ("sale" in "Jerusalem", "shop" in
# "bishop", "tour" in "detour"). The keyword class after "comment" is wrapped
# in (?-i:...) so it stays case-sensitive despite re.I: it targets prompts
# such as 'Comment HOPE' / 'Comment "HOPE"', not every "comment below".
PROMO=re.compile(r'link in bio|comment\s+(?-i:[A-Z"“])|comment .*link|download|\bsales?\b|\bshop|pre[- ]?order|tickets?|\btour(?:s|ing)?\b|stream|out now|giveaway|podcast|register|webinar|conference|donate|fundraiser|subscribe',re.I)
# Grief / trauma language. "rip" and "died" are whole-word matches only, so
# "Scripture", "trip" and "studied" are not flagged; "R.I.P." is accepted too.
SENSITIVE=re.compile(r'suicide|self[- ]?harm|funeral|\bdied\b|death|murder|abuse|trauma|grief|grieving|\br\.?i\.?p\b|condolence',re.I)
# Political / sexual / divisive language. "war", "trump" and "protest" are
# whole-word matches so "toward", "reward", "trumpet" and "Protestant" pass.
UNSAFE=re.compile(r'\btrump\b|biden|election|politic|lgbt|transgender|sexual|porn|onlyfans|israel|gaza|\bwars?\b|\bprotest(?:s|ed|ing|ers?|ors?)?\b',re.I)
# Page text that makes main() stop (rate-limit / checkpoint style notices).
WARN=re.compile(r'try again later|captcha|suspicious|checkpoint|temporarily blocked|restrict certain activity|automated behavior|confirm it|unusual',re.I)
# At least one of these whole words must appear for a post to be considered.
CHRIST=re.compile(r'\b(God|Jesus|Christ|Lord|Bible|Scripture|pray|prayer|grace|faith|gospel|mercy|hope|worship|church|Psalm|Romans|Isaiah|Colossians|Father|Holy Spirit|Proverbs)\b',re.I)

def post_id(url):
    """Return the shortcode that follows ``/p/`` or ``/reel/`` in *url*.

    Returns None when the URL contains neither path segment.
    """
    match = re.search(r'/(?:p|reel)/([A-Za-z0-9_-]+)', url)
    if match is None:
        return None
    return match.group(1)

def load_done():
    """Return the set of post IDs that already have a verified comment logged.

    Reads the JSON file at ``LOG`` and collects the ``postId`` of every entry
    under its ``comments`` key whose ``verified`` value is truthy.

    Returns an empty set when the file is missing, unreadable, not valid
    JSON, or does not have the expected object/list shape, so a fresh or
    damaged log never aborts the run.
    """
    try:
        # Context manager closes the handle even if parsing fails.
        with open(LOG, encoding='utf-8') as fh:
            log = json.load(fh)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Deliberately not a bare except: Ctrl-C must still propagate.
        return set()
    if not isinstance(log, dict):
        return set()
    entries = log.get('comments') or []
    if not isinstance(entries, list):
        return set()
    return {
        entry['postId']
        for entry in entries
        if isinstance(entry, dict) and entry.get('verified') and entry.get('postId')
    }
async def extract_links(b, limit=8):
    """Return up to *limit* unique post/reel URLs found on the current page.

    Evaluates a JavaScript snippet through ``b.eval`` that gathers every
    anchor whose href contains ``/p/`` or ``/reel/``, strips the query
    string, de-duplicates while keeping first-seen order, and truncates to
    *limit* entries.

    Always returns a list: a falsy evaluation result (for example ``None``)
    is normalised to ``[]`` so callers can safely take ``len()`` and slice.
    """
    selector = 'a[href*="/p/"],a[href*="/reel/"]'
    script = (
        "(() => Array.from(new Set("
        f"[...document.querySelectorAll('{selector}')]"
        ".map(a=>a.href.split('?')[0])"
        ".filter(h=>h.includes('/p/')||h.includes('/reel/'))"
        f")).slice(0,{limit}))()"
    )
    links = await b.eval(script)
    return links or []
async def main():
    """Scan Instagram for candidate posts and print the result as JSON.

    Steps, in order:
      1. Launch/connect through ``cbm_cdp`` and read the initial page state;
         stop immediately if it reports ``hasLogin``, ``warning`` or warning
         text matching ``WARN``.
      2. Collect post links from the home feed and from every profile in
         ``ACCOUNTS``.
      3. Open each not-yet-seen post, screen its visible text, and record it
         either as a candidate (at most 18) or as skipped with a reason.
      4. Print the candidates and the first 80 skipped entries between
         ``JSON_START`` and ``JSON_END`` markers.
    """
    cbm_cdp.launch_profile()
    browser = await cbm_cdp.connect(True)
    already_commented = load_done()
    candidates = []
    skipped = []

    state = await cbm_cdp.page_state(browser)
    summary = {key: state.get(key) for key in ('url', 'hasLogin', 'warning')}
    print('STATE', json.dumps(summary))
    blocked = (
        state.get('hasLogin')
        or state.get('warning')
        or WARN.search(state.get('text') or '')
    )
    if blocked:
        print('STOP_LOGIN_OR_WARNING')
        return

    # Home feed: scroll three times so more posts are present, then harvest links.
    await browser.goto('https://www.instagram.com/', 5)
    for _ in range(3):
        await browser.eval('window.scrollBy(0, Math.floor(window.innerHeight*0.85))')
        await asyncio.sleep(2)
    sources = [('home', link) for link in await extract_links(browser, 10)]

    # Each configured profile: one scroll, then harvest links.
    for handle in ACCOUNTS:
        await browser.goto(f'https://www.instagram.com/{handle}/', 5)
        await browser.eval('window.scrollBy(0, 800)')
        await asyncio.sleep(2)
        profile_links = await extract_links(browser, 8)
        print('LINKS', handle, len(profile_links), profile_links[:3])
        sources.extend((handle, link) for link in profile_links)

    # Content screens applied in this order once a post looks Christian;
    # the first matching pattern supplies the skip reason.
    screens = (
        (PROMO, 'promotional/app/event/link language'),
        (SENSITIVE, 'sensitive/grief/trauma'),
        (UNSAFE, 'political/sexual/divisive'),
    )

    visited = set()
    for origin, url in sources:
        if len(candidates) >= 18:
            break
        pid = post_id(url)
        if not pid or pid in visited:
            continue
        visited.add(pid)
        if pid in already_commented:
            skipped.append({'url': url, 'source': origin, 'reason': 'local log duplicate'})
            continue

        await browser.goto(url, 5)
        body = await browser.eval('document.body.innerText') or ''
        if WARN.search(body):
            # A warning on any post page ends the scan; results so far are still printed.
            print('STOP_WARNING_ON', url)
            break
        if re.search(r'vincent\.emmanuel\.lee|Vincent Emmanuel Lee', body, re.I):
            skipped.append({'url': url, 'source': origin, 'reason': 'visible Vincent comment'})
            continue

        # Only the first 4000 characters of page text are screened.
        head = body[:4000]
        if not CHRIST.search(head):
            reason = 'not clearly Christian'
        else:
            reason = next(
                (label for pattern, label in screens if pattern.search(head)),
                None,
            )
        if reason:
            skipped.append({
                'url': url,
                'source': origin,
                'reason': reason,
                'preview': head[:300].replace('\n', ' '),
            })
            continue

        # Account name is taken from the first non-empty line of the page text,
        # falling back to the source label when the page has no text.
        account = next(
            (line.strip() for line in body.split('\n') if line.strip()),
            origin,
        )
        candidates.append({
            'source': origin,
            'account': account,
            'postId': pid,
            'url': url,
            'text': head[:2500],
        })
        print('CAND', len(candidates), origin, url)

    print('JSON_START')
    print(json.dumps({'candidates': candidates, 'skipped': skipped[:80]}, ensure_ascii=False, indent=2))
    print('JSON_END')
# Script entry point: runs unconditionally at module load (there is no
# __main__ guard) and drives main() to completion.
asyncio.run(main())
