Update for code

aptsunny · Jan 31, 2024 · b0bbed5 · b0bbed5
1 parent 244a247
commit b0bbed5
Show file tree

Hide file tree

Showing 81 changed files with 9,718 additions and 6 deletions.
diff --git a/MobileAgent/__pycache__/api.cpython-310.pyc b/MobileAgent/__pycache__/api.cpython-310.pyc
diff --git a/MobileAgent/__pycache__/chat.cpython-310.pyc b/MobileAgent/__pycache__/chat.cpython-310.pyc
diff --git a/MobileAgent/__pycache__/controller.cpython-310.pyc b/MobileAgent/__pycache__/controller.cpython-310.pyc
diff --git a/MobileAgent/__pycache__/crop.cpython-310.pyc b/MobileAgent/__pycache__/crop.cpython-310.pyc
diff --git a/MobileAgent/__pycache__/icon_localization.cpython-310.pyc b/MobileAgent/__pycache__/icon_localization.cpython-310.pyc
diff --git a/MobileAgent/__pycache__/prompt.cpython-310.pyc b/MobileAgent/__pycache__/prompt.cpython-310.pyc
diff --git a/MobileAgent/__pycache__/text_localization.cpython-310.pyc b/MobileAgent/__pycache__/text_localization.cpython-310.pyc
diff --git a/MobileAgent/api.py b/MobileAgent/api.py
@@ -0,0 +1,35 @@
+import base64
+import requests
+
+def encode_image(image_path):
+    """Read the file at image_path and return its bytes as base64 text."""
+    with open(image_path, "rb") as handle:
+        raw_bytes = handle.read()
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode('utf-8')
+
+
+def inference_chat(chat, API_TOKEN):
+    """Send a chat history to the OpenAI chat-completions endpoint.
+
+    Args:
+        chat: Iterable of (role, content) pairs; each pair becomes one
+            entry of the request's "messages" list.
+        API_TOKEN: Token placed in the "Authorization: Bearer" header.
+
+    Returns:
+        The "content" field of the first choice's message in the response.
+
+    The request is retried without limit until a response with the
+    expected shape arrives.
+    """
+    api_url = 'https://api.openai.com/v1/chat/completions'
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {API_TOKEN}"
+    }
+
+    data = {
+        "model": 'gpt-4-vision-preview',
+        "messages": [{"role": role, "content": content} for role, content in chat],
+        "max_tokens": 2048,
+    }
+
+    while True:
+        # Reset on every attempt so the error report below never reads an
+        # unbound name or a stale response from a previous attempt.
+        res = None
+        try:
+            res = requests.post(api_url, headers=headers, json=data)
+            return res.json()['choices'][0]['message']['content']
+        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as err:
+            # Only transport failures and unexpected response shapes are
+            # retried. KeyboardInterrupt and SystemExit propagate, so the
+            # loop can still be stopped by the user.
+            print("Network Error:")
+            print(res.text if res is not None else err)
diff --git a/MobileAgent/chat.py b/MobileAgent/chat.py
@@ -0,0 +1,68 @@
+import copy
+from MobileAgent.api import encode_image
+
+
+def init_chat(instruction):
+    """Build the two-message history that opens a session.
+
+    The first entry is a "user" message holding the assistant briefing
+    followed by the given instruction; the second is a fixed "assistant"
+    acknowledgement. Each entry is a [role, content-parts] list.
+    """
+    system_prompt = (
+        "You are a helpful phone operating assistant. You need to help me operate the phone to complete my instruction.\n"
+        f"My instruction is: {instruction}"
+    )
+    return [
+        ["user", [{"type": "text", "text": system_prompt}]],
+        ["assistant", [{"type": "text", "text": "Sure. How can I help you?"}]],
+    ]
+
+
+def add_response(role, prompt, chat_history, image=None):
+    """Return a deep copy of chat_history with one new message appended.
+
+    The message always carries prompt as a text part. When image is
+    truthy it is treated as a file path, base64-encoded via encode_image
+    and attached as a second "image_url" part. The input history is not
+    modified.
+    """
+    content = [
+        {
+            "type": "text",
+            "text": prompt
+        },
+    ]
+    if image:
+        content.append({
+            "type": "image_url",
+            "image_url": {
+                "url": f"data:image/jpeg;base64,{encode_image(image)}"
+            }
+        })
+
+    updated_history = copy.deepcopy(chat_history)
+    updated_history.append([role, content])
+    return updated_history
+
+
+def add_multiimage_response(role, prompt, chat_history, images):
+    """Return a deep copy of chat_history with a text-plus-images message.
+
+    The appended message starts with prompt as a text part, followed by
+    one "image_url" part per path in images (each file is base64-encoded
+    via encode_image). The input history is not modified.
+    """
+    image_parts = [
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": f"data:image/jpeg;base64,{encode_image(path)}"
+            }
+        }
+        for path in images
+    ]
+    content = [{"type": "text", "text": prompt}] + image_parts
+
+    updated_history = copy.deepcopy(chat_history)
+    updated_history.append([role, content])
+    return updated_history
+
+
+def print_status(chat_history):
+    """Print every message of chat_history between two rows of asterisks.
+
+    For each message the role is printed, then the text of its first
+    content part followed by one "<image>" marker per additional part.
+    """
+    banner = "*" * 100
+    print(banner)
+    for message in chat_history:
+        role, parts = message[0], message[1]
+        image_markers = "<image>" * (len(parts) - 1)
+        print("role:", role)
+        print(parts[0]["text"] + image_markers)
+    print(banner)
diff --git a/MobileAgent/controller.py b/MobileAgent/controller.py
@@ -0,0 +1,84 @@
+import time
+import subprocess
+from PIL import Image
+
+
+def get_size(adb_path):
+    """Return the device screen size as (width, height) in pixels.
+
+    Runs "<adb_path> shell wm size" and parses the last whitespace-separated
+    token of the last output line, which is expected to look like
+    "1080x1920".
+
+    Raises:
+        ValueError: If the output does not end in a WIDTHxHEIGHT token
+            (for example when the command printed nothing). The message
+            includes the captured stdout and stderr.
+    """
+    command = adb_path + " shell wm size"
+    result = subprocess.run(command, capture_output=True, text=True, shell=True)
+    try:
+        # The last line is used, so when several sizes are reported the
+        # final one wins.
+        size_token = result.stdout.strip().splitlines()[-1].split()[-1]
+        width, height = map(int, size_token.split('x'))
+    except (IndexError, ValueError):
+        raise ValueError(
+            f"Could not parse screen size from `{command}`: "
+            f"stdout={result.stdout!r}, stderr={result.stderr!r}"
+        ) from None
+    return width, height
+
+
+def get_screenshot(adb_path):
+    """Capture the device screen and store it under ./screenshot.
+
+    Steps: delete any previous /sdcard/screenshot.png on the device, take
+    a new screencap, pull it into ./screenshot, then write a copy scaled
+    to half width and height as ./screenshot/screenshot.jpg. Short sleeps
+    separate the adb commands. Returns None.
+    """
+    import os
+
+    # The code below reads ./screenshot/screenshot.png, so ./screenshot must
+    # be a directory before the pull. NOTE(review): adb is expected to place
+    # the file inside an existing directory; confirm against the adb docs.
+    os.makedirs("./screenshot", exist_ok=True)
+
+    command = adb_path + " shell rm /sdcard/screenshot.png"
+    subprocess.run(command, capture_output=True, text=True, shell=True)
+    time.sleep(0.5)
+    command = adb_path + " shell screencap -p /sdcard/screenshot.png"
+    subprocess.run(command, capture_output=True, text=True, shell=True)
+    time.sleep(0.5)
+    command = adb_path + " pull /sdcard/screenshot.png ./screenshot"
+    subprocess.run(command, capture_output=True, text=True, shell=True)
+    image_path = "./screenshot/screenshot.png"
+    save_path = "./screenshot/screenshot.jpg"
+    # The context manager closes the PNG file handle once the JPEG is saved.
+    with Image.open(image_path) as image:
+        original_width, original_height = image.size
+        new_width = int(original_width * 0.5)
+        new_height = int(original_height * 0.5)
+        resized_image = image.resize((new_width, new_height))
+        # JPEG has no alpha channel, hence the conversion to RGB.
+        resized_image.convert("RGB").save(save_path, "JPEG")
+    time.sleep(1)
+
+
+def tap(adb_path, x, y, px, py):
+    """Tap the screen at (x * px, y * py), truncated to integers.
+
+    x and y are multiplied by px and py respectively, so they act as
+    fractions of a px-by-py screen. Sleeps one second after the tap.
+    """
+    pixel_x = int(x * px)
+    pixel_y = int(y * py)
+    subprocess.run(
+        adb_path + f" shell input tap {pixel_x} {pixel_y}",
+        capture_output=True,
+        text=True,
+        shell=True,
+    )
+    time.sleep(1)
+
+
+def type(adb_path, text):
+    """Send text to the device one character at a time, then sleep 1 s.
+
+    Newlines (real ones and the two-character sequence backslash-n) are
+    first replaced by "_", and every "_" is sent as key event 66. This
+    means an underscore already present in text is sent the same way.
+    Spaces are sent as "%s", ASCII letters and digits as plain
+    "input text", a fixed set of punctuation as quoted "input text", and
+    everything else through an ADB_INPUT_TEXT broadcast.
+    """
+    normalized = text.replace("\\n", "_").replace("\n", "_")
+    for ch in normalized:
+        if ch == ' ':
+            adb_args = " shell input text %s"
+        elif ch == '_':
+            adb_args = " shell input keyevent 66"
+        elif ch.isdigit() or 'a' <= ch <= 'z' or 'A' <= ch <= 'Z':
+            adb_args = f" shell input text {ch}"
+        elif ch in '-.,!?@\'°/:;()':
+            adb_args = f" shell input text \"{ch}\""
+        else:
+            adb_args = f" shell am broadcast -a ADB_INPUT_TEXT --es msg \"{ch}\""
+        subprocess.run(adb_path + adb_args, capture_output=True, text=True, shell=True)
+    time.sleep(1)
+
+
+def slide(adb_path, action, x, y):
+    """Swipe vertically from the point (x/2, y/2), then sleep one second.
+
+    If action contains "down" the swipe ends at y/4; otherwise, if it
+    contains "up", the swipe ends at 3*y/4. Any other action sends no
+    swipe. The swipe duration argument is 500.
+    """
+    swipe_down = "down" in action
+    swipe_up = not swipe_down and "up" in action
+    if swipe_down or swipe_up:
+        mid_x = int(x / 2)
+        mid_y = int(y / 2)
+        end_y = int(y / 4) if swipe_down else int(3 * y / 4)
+        subprocess.run(
+            adb_path + f" shell input swipe {mid_x} {mid_y} {mid_x} {end_y} 500",
+            capture_output=True,
+            text=True,
+            shell=True,
+        )
+    time.sleep(1)
+
+
+def back(adb_path):
+    """Send key event 4 to the device, then sleep one second."""
+    subprocess.run(
+        adb_path + " shell input keyevent 4",
+        capture_output=True,
+        text=True,
+        shell=True,
+    )
+    time.sleep(1)
+
+
+def back_to_desktop(adb_path):
+    """Start the MAIN/HOME intent on the device, then sleep one second."""
+    home_intent = " shell am start -a android.intent.action.MAIN -c android.intent.category.HOME"
+    subprocess.run(adb_path + home_intent, capture_output=True, text=True, shell=True)
+    time.sleep(1)
diff --git a/MobileAgent/crop.py b/MobileAgent/crop.py
@@ -0,0 +1,141 @@
+import math
+import cv2
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+import clip
+import torch
+
+
+def crop_image(img, position):
+    """Warp the quadrilateral given by position into an upright rectangle.
+
+    position must provide .tolist() yielding four [x, y] points. The
+    points are ordered by x, the two left-most and the two right-most are
+    each ordered by y, and the resulting four corners are mapped by a
+    perspective transform onto a rectangle whose width and height are the
+    distances between the midpoints of opposite sides.
+    """
+    def distance(x1, y1, x2, y2):
+        return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2))
+
+    pts = position.tolist()
+
+    # Exchange sort by x. Kept as is (rather than sorted()) because the
+    # resulting order of points with equal x depends on this exact scheme.
+    for i in range(4):
+        for j in range(i + 1, 4):
+            if pts[i][0] > pts[j][0]:
+                pts[i], pts[j] = pts[j], pts[i]
+
+    # Within the left pair and within the right pair, smaller y goes first.
+    if pts[0][1] > pts[1][1]:
+        pts[0], pts[1] = pts[1], pts[0]
+    if pts[2][1] > pts[3][1]:
+        pts[2], pts[3] = pts[3], pts[2]
+
+    la_x, la_y = pts[0][0], pts[0][1]  # left pair, smaller y
+    lb_x, lb_y = pts[1][0], pts[1][1]  # left pair, larger y
+    ra_x, ra_y = pts[2][0], pts[2][1]  # right pair, smaller y
+    rb_x, rb_y = pts[3][0], pts[3][1]  # right pair, larger y
+
+    corners = np.array(
+        [[la_x, la_y], [ra_x, ra_y], [lb_x, lb_y], [rb_x, rb_y]],
+        np.float32,
+    )
+
+    # Width: distance between midpoints of the left and right sides.
+    # Height: distance between midpoints of the two remaining sides.
+    img_width = distance((la_x + lb_x) / 2, (la_y + lb_y) / 2, (ra_x + rb_x) / 2, (ra_y + rb_y) / 2)
+    img_height = distance((la_x + ra_x) / 2, (la_y + ra_y) / 2, (lb_x + rb_x) / 2, (lb_y + rb_y) / 2)
+
+    corners_trans = np.array(
+        [
+            [0, 0],
+            [img_width - 1, 0],
+            [0, img_height - 1],
+            [img_width - 1, img_height - 1],
+        ],
+        np.float32,
+    )
+
+    transform = cv2.getPerspectiveTransform(corners, corners_trans)
+    return cv2.warpPerspective(img, transform, (int(img_width), int(img_height)))
+
+
+def calculate_size(box):
+    """Return (box[2] - box[0]) * (box[3] - box[1]).
+
+    This is the box area when box is laid out as (left, top, right, bottom).
+    """
+    span_x = box[2] - box[0]
+    span_y = box[3] - box[1]
+    return span_x * span_y
+
+
+def calculate_iou(box1, box2):
+    """Return the intersection-over-union of two boxes.
+
+    Each box is indexed as [0], [1] = lower x, y bounds and [2], [3] =
+    upper x, y bounds. Non-overlapping boxes give 0. When the union area
+    is zero (both boxes have zero area) 0.0 is returned instead of
+    dividing by zero.
+    """
+    inter_w = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
+    inter_h = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
+    inter_area = inter_w * inter_h
+
+    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+    union_area = box1_area + box2_area - inter_area
+
+    if union_area == 0:
+        # Two degenerate boxes share no area; report no overlap.
+        return 0.0
+    return inter_area / union_area
+
+
+def crop(image, box, i, text_data=None):
+    """Crop box out of an image file and save it as ./temp/{i}.jpg.
+
+    Args:
+        image: Path (or file object) accepted by PIL's Image.open.
+        box: Crop rectangle passed to Image.crop.
+        i: Used for the output file name and, when text_data is given,
+            drawn as a label.
+        text_data: Optional 4-item box (x0, y0, x1, y1). When truthy, a red
+            rectangle is drawn around it and str(i) is written in red near
+            its top-left corner before cropping.
+    """
+    # The context manager releases the file handle that Image.open keeps.
+    with Image.open(image) as source:
+        if text_data:
+            draw = ImageDraw.Draw(source)
+            draw.rectangle(((text_data[0], text_data[1]), (text_data[2], text_data[3])), outline="red", width=5)
+            # The label is 75% of the box height; it is clamped to 1 because
+            # a zero size would be passed to the font loader for short boxes.
+            font_size = max(1, int((text_data[3] - text_data[1]) * 0.75))
+            try:
+                font = ImageFont.truetype("arial.ttf", font_size)
+            except OSError:
+                # arial.ttf is not available on every system; fall back to
+                # PIL's built-in font rather than failing the whole crop.
+                font = ImageFont.load_default()
+            draw.text((text_data[0]+5, text_data[1]+5), str(i), font=font, fill="red")
+
+        cropped_image = source.crop(box)
+        cropped_image.save(f"./temp/{i}.jpg")
+
+
+def in_box(box, target):
+    """Return True when box lies strictly inside target on all four sides.
+
+    Both arguments are indexed [0..3]; box must have larger [0] and [1]
+    and smaller [2] and [3] than target. Touching edges count as outside.
+    """
+    starts_inside = box[0] > target[0] and box[1] > target[1]
+    ends_inside = box[2] < target[2] and box[3] < target[3]
+    return bool(starts_inside and ends_inside)
+
+
+def crop_for_clip(image, box, i, position):
+    """Save the crop ./temp/{i}.jpg if box lies inside the named region.
+
+    position selects a half or quarter of the image ("left", "right",
+    "top", "bottom", "top left", "top right", "bottom left",
+    "bottom right"); any other value selects the whole image. The box
+    must be strictly inside that region (see in_box).
+
+    Returns:
+        True if the crop was saved, False if box was outside the region.
+    """
+    # The context manager releases the file handle that Image.open keeps.
+    with Image.open(image) as source:
+        w, h = source.size
+        regions = {
+            "left": [0, 0, w/2, h],
+            "right": [w/2, 0, w, h],
+            "top": [0, 0, w, h/2],
+            "bottom": [0, h/2, w, h],
+            "top left": [0, 0, w/2, h/2],
+            "top right": [w/2, 0, w, h/2],
+            "bottom left": [0, h/2, w/2, h],
+            "bottom right": [w/2, h/2, w, h],
+        }
+        bound = regions.get(position, [0, 0, w, h])
+
+        if not in_box(box, bound):
+            return False
+
+        cropped_image = source.crop(box)
+        cropped_image.save(f"./temp/{i}.jpg")
+        return True
+
+
+def clip_for_icon(clip_model, clip_preprocess, images, prompt):
+    """Return the index of the image that best matches prompt.
+
+    Each file in images is preprocessed and encoded with clip_model, the
+    prompt is tokenized and encoded as text, both feature sets are
+    L2-normalised, and the index of the highest image-text similarity
+    (after a softmax over the images) is returned as a Python int.
+    """
+    # Inference only: without no_grad every encode call would record an
+    # autograd graph that is never used.
+    with torch.no_grad():
+        image_features = []
+        for image_file in images:
+            # Close each file as soon as it has been turned into a tensor.
+            with Image.open(image_file) as opened:
+                image = clip_preprocess(opened).unsqueeze(0)
+            image_features.append(clip_model.encode_image(image))
+        image_features = torch.cat(image_features)
+
+        text = clip.tokenize([prompt])
+        text_features = clip_model.encode_text(text)
+
+        image_features /= image_features.norm(dim=-1, keepdim=True)
+        text_features /= text_features.norm(dim=-1, keepdim=True)
+        similarity = (100.0 * image_features @ text_features.T).softmax(dim=0).squeeze(0)
+        _, max_pos = torch.max(similarity, dim=0)
+
+    return max_pos.item()