Return to BSD News archive
Newsgroups: comp.unix.bsd Path: sserve!manuel.anu.edu.au!munnari.oz.au!uunet!pmafire!news.dell.com!swrinde!gatech!hubcap!ncrcae!ncr-sd!crash!fpm From: fpm@crash.cts.com (Frank Maclachlan) Subject: Functions are not word-aligned in 386BSD 0.1 locore.s (w/ patch) Organization: CTS Network Services (crash, ctsnet), El Cajon, CA Date: 20 Oct 92 13:32:56 PDT Message-ID: <1992Oct20.133257.29726@crash> Keywords: patch 386BSD Lines: 738 I've wanted to post this for a long time, but resisted since it doesn't *fix* anything; it simply makes certain kernel routines a tiny bit faster. I noticed that branch/function call addresses and some data items in '/sys/i386/i386/locore.s' and '/sys/i386/isa/icu.s' are not aligned on 32 bit word boundaries (0 modulo 4). This causes a small performance hit on 386DX/486 based machines. The 486, in fact, performs best when branch addresses are aligned on 16 byte boundaries (0 modulo 16) due to its burst cache fill capability; this is one of the reasons why the gcc 2.2.2 -486 option often results in larger binaries. Anyway, I modified '/sys/i386/i386/locore.s' and '/sys/i386/isa/icu.s' to align things on 4 byte boundaries. I used a macro, ALIGN32, to perform the alignment since the new version of the GNU assembler will probably change the meaning of .align 2. I also eliminated an unnecessary instruction in ___divsi3. I don't claim a profound performance boost, but every little bit helps! To apply the patch, change directory to /sys and type the following: patch -p0 <patch_file_name Here's the patch: diff -rc i386.ORIG/i386/locore.s i386/i386/locore.s *** i386.ORIG/i386/locore.s Wed Aug 12 21:03:05 1992 --- i386/i386/locore.s Tue Oct 20 12:56:10 1992 *************** *** 63,68 **** --- 63,69 ---- .set SYSPDROFF,0x3F8 # Page dir index of System Base #define NOP inb $0x84, %al ; inb $0x84, %al + #define ALIGN32 .align 2 /* 2^2 = 4 */ /* * PTmap is recursive pagemap at top of virtual address space. 
*************** *** 372,377 **** --- 373,379 ---- * Support routines for GCC */ .globl ___udivsi3 + ALIGN32 ___udivsi3: movl 4(%esp),%eax xorl %edx,%edx *************** *** 379,387 **** ret .globl ___divsi3 ___divsi3: movl 4(%esp),%eax ! xorl %edx,%edx cltd idivl 8(%esp) ret --- 381,390 ---- ret .globl ___divsi3 + ALIGN32 ___divsi3: movl 4(%esp),%eax ! #xorl %edx,%edx /* not needed - cltd sign extends into %edx */ cltd idivl 8(%esp) ret *************** *** 390,395 **** --- 393,399 ---- * I/O bus instructions via C */ .globl _inb + ALIGN32 _inb: movl 4(%esp),%edx subl %eax,%eax # clr eax NOP *************** *** 398,403 **** --- 402,408 ---- .globl _inw + ALIGN32 _inw: movl 4(%esp),%edx subl %eax,%eax # clr eax NOP *************** *** 406,411 **** --- 411,417 ---- .globl _rtcin + ALIGN32 _rtcin: movl 4(%esp),%eax outb %al,$0x70 subl %eax,%eax # clr eax *************** *** 413,418 **** --- 419,425 ---- ret .globl _outb + ALIGN32 _outb: movl 4(%esp),%edx NOP movl 8(%esp),%eax *************** *** 421,426 **** --- 428,434 ---- ret .globl _outw + ALIGN32 _outw: movl 4(%esp),%edx NOP movl 8(%esp),%eax *************** *** 433,438 **** --- 441,447 ---- */ .globl _bzero + ALIGN32 _bzero: pushl %edi movl 8(%esp),%edi *************** *** 454,459 **** --- 463,469 ---- */ .globl _fillw + ALIGN32 _fillw: pushl %edi movl 8(%esp),%eax *************** *** 466,471 **** --- 476,482 ---- ret .globl _bcopyb + ALIGN32 _bcopyb: pushl %esi pushl %edi *************** *** 486,491 **** --- 497,503 ---- */ .globl _bcopy,_ovbcopy + ALIGN32 _ovbcopy: _bcopy: pushl %esi *************** *** 507,512 **** --- 519,525 ---- popl %esi xorl %eax,%eax ret + ALIGN32 1: addl %ecx,%edi /* copy backwards. 
*/ addl %ecx,%esi *************** *** 530,535 **** --- 543,549 ---- #ifdef notdef .globl _copyout + ALIGN32 _copyout: movl _curpcb, %eax movl $cpyflt, PCB_ONFAULT(%eax) # in case we page/protection violate *************** *** 596,601 **** --- 610,616 ---- ret .globl _copyin + ALIGN32 _copyin: movl _curpcb,%eax movl $cpyflt,PCB_ONFAULT(%eax) # in case we page/protection violate *************** *** 621,626 **** --- 636,642 ---- movl %eax,PCB_ONFAULT(%edx) ret + ALIGN32 cpyflt: popl %ebx popl %edi *************** *** 631,636 **** --- 647,653 ---- ret #else .globl _copyout + ALIGN32 _copyout: movl _curpcb,%eax movl $cpyflt,PCB_ONFAULT(%eax) # in case we page/protection violate *************** *** 655,660 **** --- 672,678 ---- ret .globl _copyin + ALIGN32 _copyin: movl _curpcb,%eax movl $cpyflt,PCB_ONFAULT(%eax) # in case we page/protection violate *************** *** 678,683 **** --- 696,702 ---- movl %eax,PCB_ONFAULT(%edx) ret + ALIGN32 cpyflt: popl %edi popl %esi movl _curpcb,%edx *************** *** 689,694 **** --- 708,714 ---- # insb(port,addr,cnt) .globl _insb + ALIGN32 _insb: pushl %edi movw 8(%esp),%dx *************** *** 705,710 **** --- 725,731 ---- # insw(port,addr,cnt) .globl _insw + ALIGN32 _insw: pushl %edi movw 8(%esp),%dx *************** *** 720,725 **** --- 741,747 ---- # outsw(port,addr,cnt) .globl _outsw + ALIGN32 _outsw: pushl %esi movw 8(%esp),%dx *************** *** 735,740 **** --- 757,763 ---- # outsb(port,addr,cnt) .globl _outsb + ALIGN32 _outsb: pushl %esi movw 8(%esp),%dx *************** *** 753,758 **** --- 776,782 ---- * void lgdt(struct region_descriptor *rdp); */ .globl _lgdt + ALIGN32 _lgdt: /* reload the descriptor table */ movl 4(%esp),%eax *************** *** 779,784 **** --- 803,809 ---- * void lidt(struct region_descriptor *rdp); */ .globl _lidt + ALIGN32 _lidt: movl 4(%esp),%eax lidt (%eax) *************** *** 788,793 **** --- 813,819 ---- * void lldt(u_short sel) */ .globl _lldt + ALIGN32 _lldt: lldt 4(%esp) ret *************** 
*** 796,801 **** --- 822,828 ---- * void ltr(u_short sel) */ .globl _ltr + ALIGN32 _ltr: ltr 4(%esp) ret *************** *** 805,810 **** --- 832,838 ---- */ .globl _lcr3 .globl _load_cr3 + ALIGN32 _load_cr3: _lcr3: inb $0x84,%al # check wristwatch *************** *** 816,821 **** --- 844,850 ---- # tlbflush() .globl _tlbflush + ALIGN32 _tlbflush: inb $0x84,%al # check wristwatch movl %cr3,%eax *************** *** 826,831 **** --- 855,861 ---- # lcr0(cr0) .globl _lcr0,_load_cr0 + ALIGN32 _lcr0: _load_cr0: movl 4(%esp),%eax *************** *** 834,839 **** --- 864,870 ---- # rcr0() .globl _rcr0 + ALIGN32 _rcr0: movl %cr0,%eax ret *************** *** 840,845 **** --- 871,877 ---- # rcr2() .globl _rcr2 + ALIGN32 _rcr2: movl %cr2,%eax ret *************** *** 847,852 **** --- 879,885 ---- # rcr3() .globl _rcr3 .globl __cr3 + ALIGN32 __cr3: _rcr3: movl %cr3,%eax *************** *** 854,859 **** --- 887,893 ---- # ssdtosd(*ssdp,*sdp) .globl _ssdtosd + ALIGN32 _ssdtosd: pushl %ebx movl 8(%esp),%ecx *************** *** 877,882 **** --- 911,917 ---- /* * {fu,su},{byte,word} */ + ALIGN32 ALTENTRY(fuiword) ENTRY(fuword) movl _curpcb,%ecx *************** *** 887,892 **** --- 922,928 ---- movl $0,PCB_ONFAULT(%ecx) ret + ALIGN32 ENTRY(fusword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) #in case we page/protection violate *************** *** 896,901 **** --- 932,938 ---- movl $0,PCB_ONFAULT(%ecx) ret + ALIGN32 ALTENTRY(fuibyte) ENTRY(fubyte) movl _curpcb,%ecx *************** *** 906,911 **** --- 943,949 ---- movl $0,PCB_ONFAULT(%ecx) ret + ALIGN32 fusufault: movl _curpcb,%ecx xorl %eax,%eax *************** *** 913,918 **** --- 951,957 ---- decl %eax ret + ALIGN32 ALTENTRY(suiword) ENTRY(suword) movl _curpcb,%ecx *************** *** 944,949 **** --- 983,989 ---- movl %eax,PCB_ONFAULT(%ecx) #in case we page/protection violate ret + ALIGN32 ENTRY(susword) movl _curpcb,%ecx movl $fusufault,PCB_ONFAULT(%ecx) #in case we page/protection violate *************** *** 972,977 **** 
--- 1012,1018 ---- movl %eax,PCB_ONFAULT(%ecx) #in case we page/protection violate ret + ALIGN32 ALTENTRY(suibyte) ENTRY(subyte) movl _curpcb,%ecx *************** *** 1001,1006 **** --- 1042,1048 ---- movl %eax,PCB_ONFAULT(%ecx) #in case we page/protection violate ret + ALIGN32 ENTRY(setjmp) movl 4(%esp),%eax movl %ebx, 0(%eax) # save ebx *************** *** 1013,1018 **** --- 1055,1061 ---- xorl %eax,%eax # return (0); ret + ALIGN32 ENTRY(longjmp) movl 4(%esp),%eax movl 0(%eax),%ebx # restore ebx *************** *** 1044,1049 **** --- 1087,1093 ---- * * Call should be made at spl6(), and p->p_stat should be SRUN */ + ALIGN32 ENTRY(setrq) movl 4(%esp),%eax cmpl $0,P_RLINK(%eax) # should not be on q already *************** *** 1070,1075 **** --- 1114,1120 ---- * * Call should be made at spl6(). */ + ALIGN32 ENTRY(remrq) movl 4(%esp),%eax movzbl P_PRI(%eax),%edx *************** *** 1106,1111 **** --- 1151,1157 ---- * to wait for something to come ready. */ .globl Idle + ALIGN32 Idle: idle: call _spl0 *************** *** 1123,1128 **** --- 1169,1175 ---- /* * Swtch() */ + ALIGN32 ENTRY(swtch) incl _cnt+V_SWTCH *************** *** 1254,1259 **** --- 1301,1307 ---- ret .globl _mvesp + ALIGN32 _mvesp: movl %esp,%eax ret /* *************** *** 1265,1270 **** --- 1313,1319 ---- * Since this code requires a parameter from the "old" stack, * pass it back as a return value. */ + ALIGN32 ENTRY(swtch_to_inactive) popl %edx # old pc popl %eax # arg, our return value *************** *** 1279,1284 **** --- 1328,1334 ---- * Update pcb, saving current processor state and arranging * for alternate return ala longjmp in swtch if altreturn is true. */ + ALIGN32 ENTRY(savectx) movl 4(%esp), %ecx movw _cpl, %ax *************** *** 1329,1334 **** --- 1379,1385 ---- * update profiling information for the user process. 
*/ + ALIGN32 ENTRY(addupc) pushl %ebp movl %esp,%ebp *************** *** 1358,1363 **** --- 1409,1415 ---- leave ret + ALIGN32 proffault: /* if we get a fault, then kill profiling all together */ movl $0,PCB_ONFAULT(%edx) /* squish the fault handler */ *************** *** 1367,1372 **** --- 1419,1425 ---- ret .data + ALIGN32 .globl _cyloffset, _curpcb _cyloffset: .long 0 .globl _proc0paddr *************** *** 1466,1471 **** --- 1519,1525 ---- IDTVEC(rsvd14) pushl $0; TRAP(31) + ALIGN32 alltraps: pushal nop *************** *** 1491,1496 **** --- 1545,1551 ---- * This code checks for a kgdb trap, then falls through * to the regular trap code. */ + ALIGN32 bpttraps: pushal nop *************** *** 1511,1516 **** --- 1566,1572 ---- * Call gate entry for syscall */ + ALIGN32 IDTVEC(syscall) pushfl # only for stupid carry bit and more stupid wait3 cc kludge pushal # only need eax,ecx,edx - trap resaves others *************** *** 1529,1534 **** --- 1585,1591 ---- popfl lret + ALIGN32 ENTRY(htonl) ENTRY(ntohl) movl 4(%esp),%eax *************** *** 1537,1542 **** --- 1594,1600 ---- xchgb %al,%ah ret + ALIGN32 ENTRY(htons) ENTRY(ntohs) movzwl 4(%esp),%eax diff -rc i386.ORIG/isa/icu.s i386/isa/icu.s *** i386.ORIG/isa/icu.s Tue May 12 20:21:27 1992 --- i386/isa/icu.s Mon Oct 19 15:27:54 1992 *************** *** 43,48 **** --- 43,49 ---- */ .data + ALIGN32 .globl _imen .globl _cpl _cpl: .long 0xffff # current priority level (all off) *************** *** 62,67 **** --- 63,69 ---- /* * Handle return from interrupt after device handler finishes */ + ALIGN32 doreti: cli popl %ebx # remove intr number *************** *** 89,94 **** --- 91,97 ---- addl $8,%esp iret + ALIGN32 1: cmpl $0,_netisr # check for softint s/traps jne 1f cmpl $0,_want_resched *************** *** 102,107 **** --- 105,111 ---- #include "../net/netisr.h" + ALIGN32 1: #define DONET(s, c) ; \ *************** *** 171,176 **** --- 175,181 ---- .globl _splhigh .globl _splclock + ALIGN32 _splhigh: _splclock: cli # 
disable interrupts *************** *** 190,195 **** --- 195,201 ---- ret .globl _spltty # block clists + ALIGN32 _spltty: cli # disable interrupts NOP *************** *** 210,215 **** --- 216,222 ---- .globl _splimp .globl _splnet + ALIGN32 _splimp: _splnet: cli # disable interrupts *************** *** 230,235 **** --- 237,243 ---- ret .globl _splbio + ALIGN32 _splbio: cli # disable interrupts NOP *************** *** 249,254 **** --- 257,263 ---- ret .globl _splsoftclock + ALIGN32 _splsoftclock: cli # disable interrupts NOP *************** *** 269,274 **** --- 278,284 ---- .globl _splnone .globl _spl0 + ALIGN32 _splnone: _spl0: cli # disable interrupts *************** *** 307,312 **** --- 317,323 ---- ret .globl _splx + ALIGN32 _splx: cli # disable interrupts NOP -- UUCP: {hplabs!hp-sdd ucsd nosc}!crash!fpm ARPA: crash!fpm@nosc.mil INET: fpm@crash.cts.com